Feat: add image support to Markdown parsing (#7124)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/6984

1. The Markdown parser now supports extracting pictures referenced in a document.
2. The Naive chunking method now handles those images when processing Markdown.
3. Merging and chunk tokenization were extended to carry images alongside text chunks (see the sketch below).
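
As a rough illustration of item 1, here is a minimal, self-contained sketch of the URL-extraction technique (a hypothetical helper, not code from this diff; it assumes the `markdown` and `beautifulsoup4` packages are installed):

```python
# Hypothetical stand-alone helper mirroring the approach this PR takes:
# render the markdown to HTML, then collect <img src> attributes.
import markdown
from bs4 import BeautifulSoup

def picture_urls(md_text: str):
    html = markdown.markdown(md_text)
    soup = BeautifulSoup(html, "html.parser")
    return [img.get("src") for img in soup.find_all("img") if img.get("src")]

print(picture_urls("Intro ![logo](https://example.com/logo.png) text"))
# ['https://example.com/logo.png']
```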

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
Authored by Stephen Hu on 2025-04-25 18:35:28 +08:00, committed via GitHub
commit 1662c7eda3 (parent fef44a71c5)
4 changed files with 120 additions and 18 deletions


```diff
@@ -22,7 +22,7 @@ from timeit import default_timer as timer
 from docx import Document
 from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
-from markdown import markdown
+import markdown
 from PIL import Image
 from tika import parser
```
```diff
@@ -31,7 +31,7 @@ from api.db.services.llm_service import LLMBundle
 from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
 from rag.utils import num_tokens_from_string
```
```diff
@@ -287,6 +287,41 @@ class Pdf(PdfParser):
 
 class Markdown(MarkdownParser):
+    def get_picture_urls(self, sections):
+        if not sections:
+            return []
+        if isinstance(sections, type("")):
+            text = sections
+        elif isinstance(sections[0], type("")):
+            text = sections[0]
+        else:
+            return []
+        from bs4 import BeautifulSoup
+        md = markdown.Markdown()
+        html_content = md.convert(text)
+        soup = BeautifulSoup(html_content, 'html.parser')
+        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return html_images
+
+    def get_pictures(self, text):
+        """Download and open all images referenced in the markdown text."""
+        import requests
+        image_urls = self.get_picture_urls(text)
+        images = []
+        for url in image_urls:
+            try:
+                response = requests.get(url, stream=True, timeout=30)
+                if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
+                    img = Image.open(BytesIO(response.content)).convert('RGB')
+                    images.append(img)
+            except Exception as e:
+                logging.error(f"Failed to download/open image from {url}: {e}")
+                continue
+        return images if images else None
+
     def __call__(self, filename, binary=None):
         if binary:
             encoding = find_codec(binary)
```
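
A hedged usage sketch of the two new methods (assumptions: `Markdown` is importable from the module patched above, the chunk-token budget is passed positionally as in `chunk()`, and the image URL is reachable):

```python
# Hypothetical usage; not part of this diff.
from rag.app.naive import Markdown  # assumed import path

md_text = "Intro ![fig](https://example.com/fig.png) more text"
parser = Markdown(128)  # chunk token budget, positional as in chunk()

urls = parser.get_picture_urls(md_text)  # ['https://example.com/fig.png']
imgs = parser.get_pictures(md_text)      # list of RGB PIL images, or None on failure
```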
```diff
@@ -335,6 +370,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
+    section_images = None
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
```
```diff
@@ -368,7 +404,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if kwargs.get("section_only", False):
             return chunks
 
-        res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         return res
```
```diff
@@ -432,7 +468,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
+        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+        sections, tables = markdown_parser(filename, binary)
+
+        # Process images for each section
+        section_images = []
+        for section_text, _ in sections:
+            images = markdown_parser.get_pictures(section_text) if section_text else None
+            if images:
+                # If multiple images are found, combine them using concat_img
+                combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
+                section_images.append(combined_image)
+            else:
+                section_images.append(None)
+
         res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
```
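
The loop above folds several images per section into one via `functools.reduce` over `concat_img` from `rag.nlp`. Below is a stand-in with the same call shape, to show what the fold produces; the real helper's exact layout may differ:

```python
# Illustrative stand-in for concat_img: stack two PIL images vertically.
from functools import reduce
from PIL import Image

def concat_img_sketch(a, b):
    if a is None:
        return b
    if b is None:
        return a
    canvas = Image.new("RGB", (max(a.width, b.width), a.height + b.height), "white")
    canvas.paste(a, (0, 0))
    canvas.paste(b, (0, a.height))
    return canvas

imgs = [Image.new("RGB", (60, 40), c) for c in ("red", "green", "blue")]
combined = reduce(concat_img_sketch, imgs)  # one 60x120 stacked image
```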
```diff
@@ -467,14 +516,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
 
     st = timer()
-    chunks = naive_merge(
-        sections, int(parser_config.get(
-            "chunk_token_num", 128)), parser_config.get(
-            "delimiter", "\n!?。;!?"))
-    if kwargs.get("section_only", False):
-        return chunks
-
-    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+    if section_images:
+        # if all images are None, set section_images to None
+        if all(image is None for image in section_images):
+            section_images = None
+
+    if section_images:
+        chunks, images = naive_merge_with_images(sections, section_images,
+                                                 int(parser_config.get(
+                                                     "chunk_token_num", 128)), parser_config.get(
+                                                     "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+    else:
+        chunks = naive_merge(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
     return res
```
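
The dispatch above in miniature: a `section_images` list that is entirely `None` collapses to `None`, which routes execution to the unchanged text-only path:

```python
# Grounded in the guard above; runnable as-is.
section_images = [None, None, None]
if section_images and all(image is None for image in section_images):
    section_images = None

merge_path = "naive_merge_with_images" if section_images else "naive_merge"
assert merge_path == "naive_merge"
```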


```diff
@@ -276,8 +276,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
         res.append(d)
     return res
 
-
-def tokenize_chunks_docx(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images):
     res = []
     # wrap up as es documents
     for ck, image in zip(chunks, images):
```
```diff
@@ -290,7 +289,6 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
         res.append(d)
     return res
 
-
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
```
```diff
@@ -539,7 +537,46 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
         add_chunk(sec, pos)
 
     return cks
 
+
+def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
+    if not texts or len(texts) != len(images):
+        return [], []
+    # Ensure each text is a str, not a tuple; if it is a tuple, take the first item
+    if isinstance(texts[0], tuple):
+        texts = [t[0] for t in texts]
+    cks = [""]
+    result_images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, result_images, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if not pos:
+            pos = ""
+        if tnum < 8:
+            pos = ""
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            result_images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            if result_images[-1] is None:
+                result_images[-1] = image
+            else:
+                result_images[-1] = concat_img(result_images[-1], image)
+            tk_nums[-1] += tnum
+
+    for text, image in zip(texts, images):
+        add_chunk(text, image)
+
+    return cks, result_images
+
 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
```
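
To see how chunks and images stay aligned, here is a deliberately simplified toy version of the merge policy above, with a whitespace split standing in for `num_tokens_from_string` and first-image-wins standing in for `concat_img`:

```python
# Toy re-implementation of naive_merge_with_images; illustration only.
def merge_with_images(texts, images, chunk_token_num=8):
    cks, imgs, tks = [""], [None], [0]
    for t, im in zip(texts, images):
        n = len(t.split())                # crude token count
        if tks[-1] > chunk_token_num:     # current chunk already full: open a new one
            cks.append(t); imgs.append(im); tks.append(n)
        else:                             # merge into the current chunk
            cks[-1] += t
            imgs[-1] = imgs[-1] if imgs[-1] is not None else im
            tks[-1] += n
    return cks, imgs

chunks, imgs = merge_with_images(
    ["one two three four five six seven eight nine ten ", "tail ", "more "],
    ["IMG-A", None, "IMG-B"])
# Two chunks come back; the first keeps IMG-A, the merged second chunk gets IMG-B.
```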


```diff
@@ -103,7 +103,7 @@ export interface IChunk {
   content_with_weight: string;
   doc_id: string;
   doc_name: string;
-  img_id: string;
+  image_id: string;
   important_kwd?: string[];
   question_kwd?: string[]; // keywords
   tag_kwd?: string[];
```


```diff
@@ -64,14 +64,14 @@ const ChunkCard = ({
     >
       <Flex gap={'middle'} justify={'space-between'}>
         <Checkbox onChange={handleCheck} checked={checked}></Checkbox>
-        {item.img_id && (
+        {item.image_id && (
           <Popover
             placement="right"
             content={
-              <Image id={item.img_id} className={styles.imagePreview}></Image>
+              <Image id={item.image_id} className={styles.imagePreview}></Image>
             }
           >
-            <Image id={item.img_id} className={styles.image}></Image>
+            <Image id={item.image_id} className={styles.image}></Image>
           </Popover>
         )}
```