diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index c0d1c86d3..7b2074239 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "audio":
            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
 
diff --git a/api/db/__init__.py b/api/db/__init__.py
index 505d603b5..7be445cd9 100644
--- a/api/db/__init__.py
+++ b/api/db/__init__.py
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    EMAIL = "email"
     KG = "knowledge_graph"
 
 
diff --git a/api/db/init_data.py b/api/db/init_data.py
index 7ced350c7..d838966ba 100644
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
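Note that this `parser_ids` string is duplicated in the `api/settings.py` default below; both must carry the same `id:display name` list or existing tenants and fresh installs would see different chunk-method menus. A minimal sketch of how such a string decomposes (the helper name is hypothetical, not part of this patch):

```python
# Hypothetical helper, for illustration only: split a parser_ids string
# like "naive:General,...,email:Email" into {id: display_name} pairs.
def split_parser_ids(parser_ids: str) -> dict[str, str]:
    pairs = {}
    for item in parser_ids.split(","):
        parser_id, _, display_name = item.partition(":")
        pairs[parser_id] = display_name
    return pairs

ids = split_parser_ids("naive:General,audio:Audio,email:Email")
assert ids["email"] == "Email"  # the new entry this patch appends
```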
print("Start to insert 2 OpenAI embedding models...") tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()]) diff --git a/api/settings.py b/api/settings.py index 90efd13a0..6efc46ab7 100644 --- a/api/settings.py +++ b/api/settings.py @@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"] API_KEY = LLM.get("api_key", "") PARSERS = LLM.get( "parsers", - "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph") + "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email") # distribution DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False) diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index ca5a16c43..a4e84a8f3 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -156,7 +156,7 @@ def filename_type(filename): return FileType.PDF.value if re.match( - r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename): + r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename): return FileType.DOC.value if re.match( diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py index d28e4ab76..67e5b5a8d 100644 --- a/deepdoc/parser/__init__.py +++ b/deepdoc/parser/__init__.py @@ -17,4 +17,5 @@ from .excel_parser import RAGFlowExcelParser as ExcelParser from .ppt_parser import RAGFlowPptParser as PptParser from .html_parser import RAGFlowHtmlParser as HtmlParser from .json_parser import RAGFlowJsonParser as JsonParser -from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser \ No newline at end of file +from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser +from .txt_parser import RAGFlowTxtParser as TxtParser \ No newline at end of file diff --git a/deepdoc/parser/html_parser.py b/deepdoc/parser/html_parser.py index ac2d3ef16..d5cde78c2 100644 --- a/deepdoc/parser/html_parser.py +++ b/deepdoc/parser/html_parser.py @@ -30,10 +30,15 @@ class RAGFlowHtmlParser: else: with open(fnm, "r",encoding=get_encoding(fnm)) as f: txt = f.read() + return self.parser_txt(txt) + @classmethod + def parser_txt(cls, txt): + if type(txt) != str: + raise TypeError("txt type should be str!") html_doc = readability.Document(txt) title = html_doc.title() content = html_text.extract_text(html_doc.summary(html_partial=True)) - txt = f'{title}\n{content}' + txt = f"{title}\n{content}" sections = txt.split("\n") return sections diff --git a/deepdoc/parser/txt_parser.py b/deepdoc/parser/txt_parser.py new file mode 100644 index 000000000..cd85ad6a7 --- /dev/null +++ b/deepdoc/parser/txt_parser.py @@ -0,0 +1,42 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/rag/app/email.py b/rag/app/email.py
new file mode 100644
index 000000000..9c843e6cf
--- /dev/null
+++ b/rag/app/email.py
@@ -0,0 +1,114 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from email import policy
+from email.parser import BytesParser
+from rag.app.naive import chunk as naive_chunk
+import re
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
+from deepdoc.parser import HtmlParser, TxtParser
+from timeit import default_timer as timer
+from rag.settings import cron_logger
+import io
+
+
+def chunk(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    **kwargs,
+):
+    """
+    Only eml is supported
+    """
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config",
+        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
+    )
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    main_res = []
+    attachment_res = []
+
+    if binary:
+        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
+    else:
+        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
+
+    text_txt, html_txt = [], []
+    # get the email header info
+    for header, value in msg.items():
+        text_txt.append(f"{header}: {value}")
+
+    # get the email main info
+    def _add_content(msg, content_type):
+        if content_type == "text/plain":
+            text_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif content_type == "text/html":
+            html_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif "multipart" in content_type:
+            if msg.is_multipart():
+                for part in msg.iter_parts():
+                    _add_content(part, part.get_content_type())
+
+    _add_content(msg, msg.get_content_type())
+
+    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
+        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
+    ]
+
+    st = timer()
+    chunks = naive_merge(
+        sections,
+        int(parser_config.get("chunk_token_num", 128)),
+        parser_config.get("delimiter", "\n!?。;!?"),
+    )
+
+    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
+    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+    # get the attachment info
+    for part in msg.iter_attachments():
+        content_disposition = part.get("Content-Disposition")
+        if content_disposition:
+            dispositions = content_disposition.strip().split(";")
+            if dispositions[0].lower() == "attachment":
+                filename = part.get_filename()
+                payload = part.get_payload(decode=True)
+                try:
+                    attachment_res.extend(
+                        naive_chunk(filename, payload, callback=callback, **kwargs)
+                    )
+                except Exception:
+                    pass
+
+    return main_res + attachment_res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], callback=dummy)
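One caveat worth flagging for review: `_add_content` decodes with `msg.get_content_charset()`, which the stdlib documents as returning `None` when a part declares no charset, in which case `decode(None)` raises. A stdlib-only sketch of the same traversal with a defensive fallback the patch itself does not add:

```python
# Illustrative variant of the traversal in rag/app/email.py, with a
# charset fallback that is NOT in the patch; "sample.eml" is a
# placeholder path, not a file shipped with this change.
from email import policy
from email.parser import BytesParser

def collect_text(part, out):
    ctype = part.get_content_type()
    if ctype == "text/plain":
        charset = part.get_content_charset() or "utf-8"  # fallback when absent
        out.append(part.get_payload(decode=True).decode(charset, errors="ignore"))
    elif part.is_multipart():
        for sub in part.iter_parts():
            collect_text(sub, out)

with open("sample.eml", "rb") as f:
    msg = BytesParser(policy=policy.default).parse(f)
texts = []
collect_text(msg, texts)
print("\n".join(texts))
```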
int(parser_config.get("chunk_token_num", 128)), + parser_config.get("delimiter", "\n!?。;!?"), + ) + + main_res.extend(tokenize_chunks(chunks, doc, eng, None)) + cron_logger.info("naive_merge({}): {}".format(filename, timer() - st)) + # get the attachment info + for part in msg.iter_attachments(): + content_disposition = part.get("Content-Disposition") + if content_disposition: + dispositions = content_disposition.strip().split(";") + if dispositions[0].lower() == "attachment": + filename = part.get_filename() + payload = part.get_payload(decode=True) + try: + attachment_res.extend( + naive_chunk(filename, payload, callback=callback, **kwargs) + ) + except Exception: + pass + + return main_res + attachment_res + + +if __name__ == "__main__": + import sys + + def dummy(prog=None, msg=""): + pass + + chunk(sys.argv[1], callback=dummy) diff --git a/rag/app/naive.py b/rag/app/naive.py index ab824bfab..b4cfd4015 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -17,7 +17,7 @@ from timeit import default_timer as timer import re from deepdoc.parser.pdf_parser import PlainParser from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx -from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser +from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser from rag.settings import cron_logger from rag.utils import num_tokens_from_string from PIL import Image @@ -170,6 +170,7 @@ class Markdown(MarkdownParser): return sections, tbls + def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ @@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - txt = "" - if binary: - encoding = find_codec(binary) - txt = binary.decode(encoding, errors="ignore") - else: - with open(filename, "r") as f: - while True: - l = f.readline() - if not l: - break - txt += l - sections = [] - for sec in txt.split("\n"): - if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)): - sections.append((sec[:int(len(sec)/2)], "")) - sections.append((sec[int(len(sec)/2):], "")) - else: - sections.append((sec, "")) - + sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128)) callback(0.8, "Finish parsing.") elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index e391d3a58..800633b90 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer from io import BytesIO import pandas as pd -from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph +from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email from api.db import LLMType, ParserType from api.db.services.document_service import DocumentService @@ -69,6 +69,7 @@ FACTORY = { ParserType.PICTURE.value: picture, ParserType.ONE.value: one, ParserType.AUDIO.value: audio, + ParserType.EMAIL.value: email, ParserType.KG.value: knowledge_graph } diff --git a/web/src/components/chunk-method-modal/hooks.ts b/web/src/components/chunk-method-modal/hooks.ts index 
diff --git a/web/src/components/chunk-method-modal/hooks.ts b/web/src/components/chunk-method-modal/hooks.ts
index 788af2ea3..9b76d1bcc 100644
--- a/web/src/components/chunk-method-modal/hooks.ts
+++ b/web/src/components/chunk-method-modal/hooks.ts
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph',
+      'knowledge_graph'
     ],
   ],
   [
@@ -67,6 +67,7 @@ const ParserListMap = new Map([
   ],
   [['md'], ['naive', 'qa', 'knowledge_graph']],
   [['json'], ['naive', 'knowledge_graph']],
+  [['eml'], ['email']]
 ]);
 
 const getParserList = (
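For a quick end-to-end check of the whole patch, the new module can be exercised standalone, mirroring its own `__main__` block (`message.eml` is a placeholder for any RFC 822 email file):

```python
# Smoke test mirroring rag/app/email.py's __main__ block.
from rag.app import email

def progress(prog=None, msg=""):
    print(prog, msg)

chunks = email.chunk("message.eml", callback=progress)
print(len(chunks), "chunks (headers + body + parsed attachments)")
```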