From a95c1d45f03687be131a9453d543c7e7e63f31a6 Mon Sep 17 00:00:00 2001 From: Zhedong Cen Date: Thu, 27 Jun 2024 14:38:35 +0800 Subject: [PATCH] Support table for markdown file in general parser (#1278) ### What problem does this PR solve? Support extracting table for markdown file in general parser ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- deepdoc/parser/__init__.py | 3 ++- deepdoc/parser/markdown_parser.py | 44 +++++++++++++++++++++++++++++++ rag/app/naive.py | 37 +++++++++++++++++++++++--- 3 files changed, 80 insertions(+), 4 deletions(-) create mode 100644 deepdoc/parser/markdown_parser.py diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py index d925e68f5..d28e4ab76 100644 --- a/deepdoc/parser/__init__.py +++ b/deepdoc/parser/__init__.py @@ -16,4 +16,5 @@ from .docx_parser import RAGFlowDocxParser as DocxParser from .excel_parser import RAGFlowExcelParser as ExcelParser from .ppt_parser import RAGFlowPptParser as PptParser from .html_parser import RAGFlowHtmlParser as HtmlParser -from .json_parser import RAGFlowJsonParser as JsonParser \ No newline at end of file +from .json_parser import RAGFlowJsonParser as JsonParser +from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser \ No newline at end of file diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py new file mode 100644 index 000000000..60e6ba498 --- /dev/null +++ b/deepdoc/parser/markdown_parser.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import re + +class RAGFlowMarkdownParser: + def __init__(self, chunk_token_num=128): + self.chunk_token_num = int(chunk_token_num) + + def extract_tables_and_remainder(self, markdown_text): + # Standard Markdown table + table_pattern = re.compile( + r''' + (?:\n|^) + (?:\|.*?\|.*?\|.*?\n) + (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) + (?:\|.*?\|.*?\|.*?\n)+ + ''', re.VERBOSE) + tables = table_pattern.findall(markdown_text) + remainder = table_pattern.sub('', markdown_text) + + # Borderless Markdown table + no_border_table_pattern = re.compile( + r''' + (?:\n|^) + (?:\S.*?\|.*?\n) + (?:(?:\s*[:-]+[-| :]*\s*).*?\n) + (?:\S.*?\|.*?\n)+ + ''', re.VERBOSE) + no_border_tables = no_border_table_pattern.findall(remainder) + tables.extend(no_border_tables) + remainder = no_border_table_pattern.sub('', remainder) + + return remainder, tables diff --git a/rag/app/naive.py b/rag/app/naive.py index 97f76112e..a22ebf619 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -17,12 +17,12 @@ from timeit import default_timer as timer import re from deepdoc.parser.pdf_parser import PlainParser from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx -from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser +from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser from rag.settings import cron_logger from rag.utils import num_tokens_from_string from PIL import Image from functools import reduce - +from markdown import markdown class Docx(DocxParser): def __init__(self): pass @@ -135,6 +135,31 @@ class Pdf(PdfParser): for b in self.boxes], tbls +class Markdown(MarkdownParser): + def __call__(self, filename, binary=None): + txt = "" + tbls = [] + if binary: + encoding = find_codec(binary) + txt = binary.decode(encoding, errors="ignore") + else: + with open(filename, "r") as f: + txt = f.read() + remainder, tables = self.extract_tables_and_remainder(f'{txt}\n') + sections = [] + tbls = [] + for sec in remainder.split("\n"): + if num_tokens_from_string(sec) > 10 * self.chunk_token_num: + sections.append((sec[:int(len(sec)/2)], "")) + sections.append((sec[int(len(sec)/2):], "")) + else: + sections.append((sec, "")) + print(tables) + for table in tables: + tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) + return sections, tbls + + def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ @@ -185,7 +210,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, excel_parser = ExcelParser() sections = [(l, "") for l in excel_parser.html(binary) if l] - elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): + elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = "" if binary: @@ -207,6 +232,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections.append((sec, "")) callback(0.8, "Finish parsing.") + + elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): + callback(0.1, "Start to parse.") + sections, tbls = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary) + res = tokenize_table(tbls, doc, eng) + callback(0.8, "Finish parsing.") elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.")