Feat: Markdown add image (#7124)
### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/6984

1. The Markdown parser now supports extracting pictures.
2. The naive chunking method handles those images when processing Markdown.
3. Improve merging.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
commit 1662c7eda3
parent fef44a71c5
@@ -22,7 +22,7 @@ from timeit import default_timer as timer
 from docx import Document
 from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
-from markdown import markdown
+import markdown
 from PIL import Image
 from tika import parser
@@ -31,7 +31,7 @@ from api.db.services.llm_service import LLMBundle
 from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
 from rag.utils import num_tokens_from_string
@@ -287,6 +287,41 @@ class Pdf(PdfParser):
 class Markdown(MarkdownParser):
+    def get_picture_urls(self, sections):
+        if not sections:
+            return []
+        if isinstance(sections, type("")):
+            text = sections
+        elif isinstance(sections[0], type("")):
+            text = sections[0]
+        else:
+            return []
+
+        from bs4 import BeautifulSoup
+        md = markdown.Markdown()
+        html_content = md.convert(text)
+        soup = BeautifulSoup(html_content, 'html.parser')
+        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return html_images
+
+    def get_pictures(self, text):
+        """Download and open all images from markdown text."""
+        import requests
+        image_urls = self.get_picture_urls(text)
+        images = []
+        # Find all image URLs in text
+        for url in image_urls:
+            try:
+                response = requests.get(url, stream=True, timeout=30)
+                if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
+                    img = Image.open(BytesIO(response.content)).convert('RGB')
+                    images.append(img)
+            except Exception as e:
+                logging.error(f"Failed to download/open image from {url}: {e}")
+                continue
+
+        return images if images else None
+
     def __call__(self, filename, binary=None):
         if binary:
             encoding = find_codec(binary)
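As a quick illustration of the two helpers added above, here is a hedged usage sketch. The markdown text and image URL are made up, and `Markdown` is the parser subclass defined in the hunk above, constructed with the chunk token budget just as `chunk()` does further down.

```python
# Hypothetical input: a markdown section that references one remote image.
md_text = "## Setup\n\n![wiring diagram](https://example.com/wiring.png)\n\nConnect the board."

parser = Markdown(128)                    # the Markdown subclass from the hunk above
urls = parser.get_picture_urls(md_text)   # ['https://example.com/wiring.png']
pictures = parser.get_pictures(md_text)   # downloads each URL; returns None if nothing usable

if pictures:
    # get_pictures() yields plain PIL.Image objects converted to RGB.
    print(pictures[0].size)
```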
@@ -335,6 +370,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
+    section_images = None
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -368,7 +404,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if kwargs.get("section_only", False):
             return chunks

-        res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         return res
@@ -432,7 +468,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
+        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+        sections, tables = markdown_parser(filename, binary)
+
+        # Process images for each section
+        section_images = []
+        for section_text, _ in sections:
+            images = markdown_parser.get_pictures(section_text) if section_text else None
+            if images:
+                # If multiple images found, combine them using concat_img
+                combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
+                section_images.append(combined_image)
+            else:
+                section_images.append(None)
+
         res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
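The per-section loop above can be read as a small helper. The sketch below mirrors it, under the assumption that `concat_img` (imported from `rag.nlp` in this file) folds two PIL images into one.

```python
from functools import reduce

from rag.nlp import concat_img


def combine_section_images(markdown_parser, section_text):
    """Mirror of the loop above: one combined image per section, or None."""
    images = markdown_parser.get_pictures(section_text) if section_text else None
    if not images:
        return None
    # Several pictures in one section are folded into a single image via concat_img.
    return reduce(concat_img, images) if len(images) > 1 else images[0]


# section_images then lines up one-to-one with sections:
# section_images = [combine_section_images(markdown_parser, text) for text, _ in sections]
```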
@@ -467,14 +516,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

    st = timer()
-    chunks = naive_merge(
-        sections, int(parser_config.get(
-            "chunk_token_num", 128)), parser_config.get(
-            "delimiter", "\n!?。;!?"))
-    if kwargs.get("section_only", False):
-        return chunks
-
-    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+    if section_images:
+        # if all images are None, set section_images to None
+        if all(image is None for image in section_images):
+            section_images = None
+
+    if section_images:
+        chunks, images = naive_merge_with_images(sections, section_images,
+            int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+    else:
+        chunks = naive_merge(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

    logging.info("naive_merge({}): {}".format(filename, timer() - st))
    return res
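The branch above boils down to one decision: markdown files whose sections yielded no images at all take the original text-only path. A minimal sketch of that decision, with made-up values:

```python
# Illustrative values: three markdown sections, none of which contained a picture.
section_images = [None, None, None]

# As in the hunk above, an all-None list is collapsed to None...
if section_images and all(image is None for image in section_images):
    section_images = None

# ...so the chunker falls back to naive_merge()/tokenize_chunks(); only when at
# least one section kept an image does it use naive_merge_with_images() and
# tokenize_chunks_with_images().
use_image_path = section_images is not None
print(use_image_path)  # False for this example
```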
@@ -276,8 +276,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
         res.append(d)
     return res


-def tokenize_chunks_docx(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images):
     res = []
     # wrap up as es documents
     for ck, image in zip(chunks, images):
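The rename makes the calling contract explicit: the helper walks `zip(chunks, images)`, so callers must pass two lists of equal length, which is exactly what `naive_merge_with_images` returns. A hedged sketch of that contract (the chunk texts are invented):

```python
from PIL import Image

# Hypothetical merged output: three chunks, only the second of which has a picture.
chunks = ["intro text ...", "section with a figure ...", "closing remarks ..."]
images = [None, Image.new("RGB", (64, 64), "white"), None]

# tokenize_chunks_with_images() iterates zip(chunks, images), so both lists must
# stay the same length -- which naive_merge_with_images() guarantees.
assert len(chunks) == len(images)
# res = tokenize_chunks_with_images(chunks, doc, is_english, images)
#   `doc` and `is_english` come from the surrounding chunk() code and are omitted here.
```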
@@ -290,7 +289,6 @@ def tokenize_chunks_docx(chunks, doc, eng, images):
         res.append(d)
     return res

-
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
@@ -539,7 +537,46 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
         add_chunk(sec, pos)

     return cks

+
+def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
+    if not texts or len(texts) != len(images):
+        return [], []
+    # Enuser texts is str not tuple, if it is tuple, convert to str (get the first item)
+    if isinstance(texts[0], tuple):
+        texts = [t[0] for t in texts]
+    cks = [""]
+    result_images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, result_images, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if not pos:
+            pos = ""
+        if tnum < 8:
+            pos = ""
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            result_images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            if result_images[-1] is None:
+                result_images[-1] = image
+            else:
+                result_images[-1] = concat_img(result_images[-1], image)
+            tk_nums[-1] += tnum
+
+    for text, image in zip(texts, images):
+        add_chunk(text, image)
+
+    return cks, result_images
+
+
 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
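To see what the new helper returns, here is a small hedged example. The texts and image are invented, and the call assumes it runs inside the ragflow environment, with `naive_merge_with_images` importable from `rag.nlp` as the updated import line earlier in this diff suggests.

```python
from PIL import Image

from rag.nlp import naive_merge_with_images

# Three short markdown sections; only the last one carried a picture.
texts = ["# Title\nA short intro.", "Some body text.", "Closing words."]
images = [None, None, Image.new("RGB", (48, 48), "white")]

chunks, chunk_images = naive_merge_with_images(texts, images, chunk_token_num=128,
                                               delimiter="\n!?。;!?")

# With a 128-token budget the three sections merge into a single chunk, and the
# one real image rides along with it.
print(len(chunks))                    # 1
print(chunk_images[0] is images[2])   # True
```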
@@ -103,7 +103,7 @@ export interface IChunk {
   content_with_weight: string;
   doc_id: string;
   doc_name: string;
-  img_id: string;
+  image_id: string;
   important_kwd?: string[];
   question_kwd?: string[]; // keywords
   tag_kwd?: string[];
@@ -64,14 +64,14 @@ const ChunkCard = ({
     >
       <Flex gap={'middle'} justify={'space-between'}>
         <Checkbox onChange={handleCheck} checked={checked}></Checkbox>
-        {item.img_id && (
+        {item.image_id && (
           <Popover
             placement="right"
             content={
-              <Image id={item.img_id} className={styles.imagePreview}></Image>
+              <Image id={item.image_id} className={styles.imagePreview}></Image>
             }
           >
-            <Image id={item.img_id} className={styles.image}></Image>
+            <Image id={item.image_id} className={styles.image}></Image>
           </Popover>
         )}