add support for eml file parser (#1768)
### What problem does this PR solve?

Add support for eml file parser. #1363

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
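A minimal usage sketch of the new parser (assumptions: a local `sample.eml` placeholder file and this PR's `rag.app.email` module on the import path; the no-op callback mirrors the `dummy` callback in the module's `__main__` block):

```python
# Minimal smoke test for the eml parser added by this PR.
# "sample.eml" is a placeholder path; progress() is a no-op progress hook.
from rag.app import email

def progress(prog=None, msg=""):
    pass

chunks = email.chunk("sample.eml", callback=progress)
print(f"{len(chunks)} chunks (mail headers/body plus parsed attachments)")
```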
parent b67484e77d
commit ede733e130
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
@@ -652,6 +652,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "audio":
             audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "email":
+            email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
 
@@ -85,6 +85,7 @@ class ParserType(StrEnum):
     PICTURE = "picture"
     ONE = "one"
     AUDIO = "audio"
+    EMAIL = "email"
     KG = "knowledge_graph"
 
 
@@ -122,7 +122,7 @@ def init_llm_factory():
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
     TenantService.filter_update([1 == 1], {
-        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph"})
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
@@ -132,7 +132,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio,knowledge_graph:Knowledge Graph,email:Email")
 
 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
@@ -156,7 +156,7 @@ def filename_type(filename):
         return FileType.PDF.value
 
     if re.match(
-            r".*\.(doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
+            r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename):
         return FileType.DOC.value
 
     if re.match(
@@ -18,3 +18,4 @@ from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
 from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .txt_parser import RAGFlowTxtParser as TxtParser
@@ -30,10 +30,15 @@ class RAGFlowHtmlParser:
         else:
             with open(fnm, "r",encoding=get_encoding(fnm)) as f:
                 txt = f.read()
+        return self.parser_txt(txt)
 
+    @classmethod
+    def parser_txt(cls, txt):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
         html_doc = readability.Document(txt)
         title = html_doc.title()
         content = html_text.extract_text(html_doc.summary(html_partial=True))
-        txt = f'{title}\n{content}'
+        txt = f"{title}\n{content}"
         sections = txt.split("\n")
         return sections
deepdoc/parser/txt_parser.py (new file, 42 lines)
@@ -0,0 +1,42 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from rag.nlp import find_codec, num_tokens_from_string
+
+class RAGFlowTxtParser:
+    def __call__(self, fnm, binary=None, chunk_token_num=128):
+        txt = ""
+        if binary:
+            encoding = find_codec(binary)
+            txt = binary.decode(encoding, errors="ignore")
+        else:
+            with open(fnm, "r") as f:
+                while True:
+                    l = f.readline()
+                    if not l:
+                        break
+                    txt += l
+        return self.parser_txt(txt, chunk_token_num)
+
+    @classmethod
+    def parser_txt(cls, txt, chunk_token_num=128):
+        if type(txt) != str:
+            raise TypeError("txt type should be str!")
+        sections = []
+        for sec in txt.split("\n"):
+            if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
+                sections.append((sec[: int(len(sec) / 2)], ""))
+                sections.append((sec[int(len(sec) / 2) :], ""))
+            else:
+                sections.append((sec, ""))
+        return sections
rag/app/email.py (new file, 114 lines)
@@ -0,0 +1,114 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+from email import policy
+from email.parser import BytesParser
+from rag.app.naive import chunk as naive_chunk
+import re
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
+from deepdoc.parser import HtmlParser, TxtParser
+from timeit import default_timer as timer
+from rag.settings import cron_logger
+import io
+
+
+def chunk(
+    filename,
+    binary=None,
+    from_page=0,
+    to_page=100000,
+    lang="Chinese",
+    callback=None,
+    **kwargs,
+):
+    """
+    Only eml is supported
+    """
+    eng = lang.lower() == "english"  # is_english(cks)
+    parser_config = kwargs.get(
+        "parser_config",
+        {"chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True},
+    )
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    main_res = []
+    attachment_res = []
+
+    if binary:
+        msg = BytesParser(policy=policy.default).parse(io.BytesIO(binary))
+    else:
+        msg = BytesParser(policy=policy.default).parse(open(filename, "rb"))
+
+    text_txt, html_txt = [], []
+    # get the email header info
+    for header, value in msg.items():
+        text_txt.append(f"{header}: {value}")
+
+    # get the email main info
+    def _add_content(msg, content_type):
+        if content_type == "text/plain":
+            text_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif content_type == "text/html":
+            html_txt.append(
+                msg.get_payload(decode=True).decode(msg.get_content_charset())
+            )
+        elif "multipart" in content_type:
+            if msg.is_multipart():
+                for part in msg.iter_parts():
+                    _add_content(part, part.get_content_type())
+
+    _add_content(msg, msg.get_content_type())
+
+    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
+        (l, "") for l in HtmlParser.parser_txt("\n".join(html_txt)) if l
+    ]
+
+    st = timer()
+    chunks = naive_merge(
+        sections,
+        int(parser_config.get("chunk_token_num", 128)),
+        parser_config.get("delimiter", "\n!?。;!?"),
+    )
+
+    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
+    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
+    # get the attachment info
+    for part in msg.iter_attachments():
+        content_disposition = part.get("Content-Disposition")
+        if content_disposition:
+            dispositions = content_disposition.strip().split(";")
+            if dispositions[0].lower() == "attachment":
+                filename = part.get_filename()
+                payload = part.get_payload(decode=True)
+                try:
+                    attachment_res.extend(
+                        naive_chunk(filename, payload, callback=callback, **kwargs)
+                    )
+                except Exception:
+                    pass
+
+    return main_res + attachment_res
+
+
+if __name__ == "__main__":
+    import sys
+
+    def dummy(prog=None, msg=""):
+        pass
+
+    chunk(sys.argv[1], callback=dummy)
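For context, a small standalone sketch of the stdlib pattern the new parser is built on: `email.parser.BytesParser` with `policy.default` yields a message object whose headers, body parts, and attachments can be walked directly. `"sample.eml"` is a placeholder for any local mail file; the attachment handling here only lists parts, whereas the PR dispatches them to the naive chunker.

```python
# Standalone sketch of the stdlib calls rag/app/email.py relies on.
from email import policy
from email.parser import BytesParser

with open("sample.eml", "rb") as f:
    msg = BytesParser(policy=policy.default).parse(f)

# Headers become "Header: value" lines, as in the parser above.
for header, value in msg.items():
    print(f"{header}: {value}")

# Preferred body part, decoded with its declared charset.
body = msg.get_body(preferencelist=("plain", "html"))
if body is not None:
    print(body.get_content()[:200])

# Attachments; the PR feeds each payload to naive_chunk instead of printing.
for part in msg.iter_attachments():
    print(part.get_filename(), part.get_content_type())
```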
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec, concat_img, naive_merge_docx, tokenize_chunks_docx
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser, MarkdownParser, TxtParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
 from PIL import Image
@@ -170,6 +170,7 @@ class Markdown(MarkdownParser):
         return sections, tbls
 
 
+
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
     """
@@ -222,25 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        txt = ""
-        if binary:
-            encoding = find_codec(binary)
-            txt = binary.decode(encoding, errors="ignore")
-        else:
-            with open(filename, "r") as f:
-                while True:
-                    l = f.readline()
-                    if not l:
-                        break
-                    txt += l
-        sections = []
-        for sec in txt.split("\n"):
-            if num_tokens_from_string(sec) > 10 * int(parser_config.get("chunk_token_num", 128)):
-                sections.append((sec[:int(len(sec)/2)], ""))
-                sections.append((sec[int(len(sec)/2):], ""))
-            else:
-                sections.append((sec, ""))
-
+        sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128))
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd
 
-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
 
 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
@@ -69,6 +69,7 @@ FACTORY = {
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
     ParserType.AUDIO.value: audio,
+    ParserType.EMAIL.value: email,
     ParserType.KG.value: knowledge_graph
 }
 
@@ -27,7 +27,7 @@ const ParserListMap = new Map([
       'one',
       'qa',
       'manual',
-      'knowledge_graph',
+      'knowledge_graph'
     ],
   ],
   [
@@ -67,6 +67,7 @@ const ParserListMap = new Map([
   ],
   [['md'], ['naive', 'qa', 'knowledge_graph']],
   [['json'], ['naive', 'knowledge_graph']],
+  [['eml'], ['email']]
 ]);
 
 const getParserList = (