Add SQL support to the naive parser (#1908)

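### What problem does this PR solve?

Adds `.sql` to the file extensions that are recognized as documents and routed to the plain-text parser used by the naive chunker, and makes the text parser split on a configurable delimiter set (default `\n!?;。;!?`, read from `parser_config["delimiter"]`) instead of on newlines only.
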
### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu 2024-08-12 15:29:33 +08:00 committed by GitHub
parent cd861e3653
commit cafdee536f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 14 additions and 8 deletions

View File

@@ -156,7 +156,7 @@ def filename_type(filename):
return FileType.PDF.value return FileType.PDF.value
if re.match( if re.match(
r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html)$", filename): r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
return FileType.DOC.value return FileType.DOC.value
if re.match( if re.match(

View File

@@ -12,6 +12,7 @@
# #
from rag.nlp import find_codec,num_tokens_from_string from rag.nlp import find_codec,num_tokens_from_string
import re
class RAGFlowTxtParser: class RAGFlowTxtParser:
def __call__(self, fnm, binary=None, chunk_token_num=128): def __call__(self, fnm, binary=None, chunk_token_num=128):
@@ -29,14 +30,17 @@ class RAGFlowTxtParser:
return self.parser_txt(txt, chunk_token_num) return self.parser_txt(txt, chunk_token_num)
@classmethod @classmethod
def parser_txt(cls, txt, chunk_token_num=128): def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
if type(txt) != str: if type(txt) != str:
raise TypeError("txt type should be str!") raise TypeError("txt type should be str!")
sections = [] sections = []
for sec in txt.split("\n"): for sec in re.split(r"[%s]+"%delimiter, txt):
if sections and sec in delimiter:
sections[-1][0] += sec
continue
if num_tokens_from_string(sec) > 10 * int(chunk_token_num): if num_tokens_from_string(sec) > 10 * int(chunk_token_num):
sections.append((sec[: int(len(sec) / 2)], "")) sections.append([sec[: int(len(sec) / 2)], ""])
sections.append((sec[int(len(sec) / 2) :], "")) sections.append([sec[int(len(sec) / 2) :], ""])
else: else:
sections.append((sec, "")) sections.append([sec, ""])
return sections return sections

View File

@@ -224,9 +224,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
excel_parser = ExcelParser() excel_parser = ExcelParser()
sections = [(l, "") for l in excel_parser.html(binary) if l] sections = [(l, "") for l in excel_parser.html(binary) if l]
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE): elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
sections = TxtParser()(filename,binary,parser_config.get("chunk_token_num", 128)) sections = TxtParser()(filename,binary,
parser_config.get("chunk_token_num", 128),
parser_config.get("delimiter", "\n!?;。;!?"))
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):