From 1662c7eda397d07f33eb8928271a3eb3845cce4c Mon Sep 17 00:00:00 2001
From: Stephen Hu
Date: Fri, 25 Apr 2025 18:35:28 +0800
Subject: [PATCH] Feat: Markdown add image (#7124)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/6984

1. The Markdown parser now supports extracting pictures referenced in the document.
2. The naive chunker handles those images when parsing Markdown.
3. Improve the merge logic.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu
---
 rag/app/naive.py                          | 87 ++++++++++++++++---
 rag/nlp/__init__.py                       | 43 ++++++++-
 web/src/interfaces/database/knowledge.ts  |  2 +-
 .../components/chunk-card/index.tsx       |  6 +-
 4 files changed, 120 insertions(+), 18 deletions(-)

diff --git a/rag/app/naive.py b/rag/app/naive.py
index a3c629e1e..dcaea240c 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -22,7 +22,7 @@
 from timeit import default_timer as timer

 from docx import Document
 from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
-from markdown import markdown
+import markdown
 from PIL import Image
 from tika import parser
@@ -31,7 +31,7 @@
 from api.db.services.llm_service import LLMBundle
 from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
 from rag.utils import num_tokens_from_string
@@ -287,6 +287,41 @@ class Pdf(PdfParser):


 class Markdown(MarkdownParser):
+    def get_picture_urls(self, sections):
+        if not sections:
+            return []
+        if isinstance(sections, type("")):
+            text = sections
+        elif isinstance(sections[0], type("")):
+            text = sections[0]
+        else:
+            return []
+
+        from bs4 import BeautifulSoup
+        md = markdown.Markdown()
+        html_content = md.convert(text)
+        soup = BeautifulSoup(html_content, 'html.parser')
+        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return html_images
+
+    def get_pictures(self, text):
+        """Download and open all images from markdown text."""
+        import requests
+        image_urls = self.get_picture_urls(text)
+        images = []
+        # Find all image URLs in text
+        for url in image_urls:
+            try:
+                response = requests.get(url, stream=True, timeout=30)
+                if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
+                    img = Image.open(BytesIO(response.content)).convert('RGB')
+                    images.append(img)
+            except Exception as e:
+                logging.error(f"Failed to download/open image from {url}: {e}")
+                continue
+
+        return images if images else None
+
     def __call__(self, filename, binary=None):
         if binary:
             encoding = find_codec(binary)
@@ -335,6 +370,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
+    section_images = None
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -368,7 +404,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if kwargs.get("section_only", False):
             return chunks

-        res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         return res
@@ -432,7 +468,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
+        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+        sections, tables = markdown_parser(filename, binary)
+
+        # Process images for each section
+        section_images = []
+        for section_text, _ in sections:
+            images = markdown_parser.get_pictures(section_text) if section_text else None
+            if images:
+                # If multiple images found, combine them using concat_img
+                combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
+                section_images.append(combined_image)
+            else:
+                section_images.append(None)
+
         res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
@@ -467,14 +516,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

     st = timer()
-    chunks = naive_merge(
-        sections, int(parser_config.get(
-            "chunk_token_num", 128)), parser_config.get(
-            "delimiter", "\n!?。;!?"))
-    if kwargs.get("section_only", False):
-        return chunks
+    if section_images:
+        # if all images are None, set section_images to None
+        if all(image is None for image in section_images):
+            section_images = None

-    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+    if section_images:
+        chunks, images = naive_merge_with_images(sections, section_images,
+                                                 int(parser_config.get(
+                                                     "chunk_token_num", 128)), parser_config.get(
+                                                     "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+    else:
+        chunks = naive_merge(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
     return res
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 171cb0911..a1edb3fe7 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -276,8 +276,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
         res.append(d)
     return res

-
-def tokenize_chunks_docx(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images):
     res = []
     # wrap up as es documents
     for ck, image in zip(chunks, images):
@@ -290,7 +289,6 @@ def tokenize_chunks_with_images(chunks, doc, eng, images):
         res.append(d)
     return res

-
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
@@ -539,7 +537,46 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
         add_chunk(sec, pos)

     return cks

+
+def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
+    if not texts or len(texts) != len(images):
+        return [], []
+    # Ensure texts is str, not tuple; if it is a tuple, convert to str (take the first item)
+    if isinstance(texts[0], tuple):
+        texts = [t[0] for t in texts]
+    cks = [""]
+    result_images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, result_images, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if not pos:
+            pos = ""
+        if tnum < 8:
+            pos = ""
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            result_images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            if result_images[-1] is None:
+                result_images[-1] = image
+            else:
+                result_images[-1] = concat_img(result_images[-1], image)
+            tk_nums[-1] += tnum
+
+    for text, image in zip(texts, images):
+        add_chunk(text, image)
+
+    return cks, result_images

 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
diff --git a/web/src/interfaces/database/knowledge.ts b/web/src/interfaces/database/knowledge.ts
index e0f0b5a58..a479aa625 100644
--- a/web/src/interfaces/database/knowledge.ts
+++ b/web/src/interfaces/database/knowledge.ts
@@ -103,7 +103,7 @@ export interface IChunk {
   content_with_weight: string;
   doc_id: string;
   doc_name: string;
-  img_id: string;
+  image_id: string;
   important_kwd?: string[];
   question_kwd?: string[]; // keywords
   tag_kwd?: string[];
diff --git a/web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx b/web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx
index 5934eb309..b7e61e06a 100644
--- a/web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx
+++ b/web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx
@@ -64,14 +64,14 @@ const ChunkCard = ({
       >
-        {item.img_id && (
+        {item.image_id && (
+
           }
         >
-
+
         )}
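
A quick way to sanity-check the image-extraction flow introduced in rag/app/naive.py is to run the same markdown-to-HTML-to-`<img>` pipeline standalone. The sketch below mirrors the approach of `Markdown.get_picture_urls` / `get_pictures` but is not part of this patch; the function names, the sample document, and the example URL are illustrative only.

```python
# Standalone sketch of the extraction flow used in this PR (names are illustrative).
from io import BytesIO

import markdown
import requests
from bs4 import BeautifulSoup
from PIL import Image


def extract_image_urls(md_text: str) -> list[str]:
    """Render markdown to HTML and collect the src of every <img> tag."""
    html = markdown.Markdown().convert(md_text)
    soup = BeautifulSoup(html, "html.parser")
    return [img.get("src") for img in soup.find_all("img") if img.get("src")]


def download_images(urls: list[str]) -> list[Image.Image]:
    """Fetch each URL and keep only responses that declare an image Content-Type."""
    images = []
    for url in urls:
        try:
            resp = requests.get(url, stream=True, timeout=30)
            if resp.status_code == 200 and resp.headers.get("Content-Type", "").startswith("image/"):
                images.append(Image.open(BytesIO(resp.content)).convert("RGB"))
        except Exception:
            continue  # skip unreachable or non-image URLs
    return images


if __name__ == "__main__":
    sample = "Intro text\n\n![diagram](https://example.com/diagram.png)\n"
    urls = extract_image_urls(sample)
    print(urls)                      # ['https://example.com/diagram.png']
    print(len(download_images(urls)))  # 0 or 1 depending on whether the URL resolves
```

As in the patch, URLs that do not resolve to an `image/*` response are skipped rather than failing the whole chunking run.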