From 5e4ac11df32f541488ec4639983a01202619c3b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=81=B0=E7=81=B0?=
Date: Thu, 25 Jul 2024 17:24:37 +0800
Subject: [PATCH] fix: code block segmentation problem of markdown document (#6465)

---
 api/core/rag/extractor/markdown_extractor.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/api/core/rag/extractor/markdown_extractor.py b/api/core/rag/extractor/markdown_extractor.py
index faa1e64057..b24cf2e170 100644
--- a/api/core/rag/extractor/markdown_extractor.py
+++ b/api/core/rag/extractor/markdown_extractor.py
@@ -54,8 +54,16 @@ class MarkdownExtractor(BaseExtractor):
 
         current_header = None
         current_text = ""
+        code_block_flag = False
 
         for line in lines:
+            if line.startswith("```"):
+                code_block_flag = not code_block_flag
+                current_text += line + "\n"
+                continue
+            if code_block_flag:
+                current_text += line + "\n"
+                continue
             header_match = re.match(r"^#+\s", line)
             if header_match:
                 if current_header is not None: