mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-15 22:05:56 +08:00
Refactor trie load and construct (#4083)
### What problem does this PR solve?

1. Fix the initial build and load of the trie.
2. Update comments.

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
This commit is contained in:
parent
f8cef73244
commit
50c2b9d562
@ -36,7 +36,7 @@ class RagTokenizer:
|
|||||||
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
|
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
|
||||||
|
|
||||||
def loadDict_(self, fnm):
|
def loadDict_(self, fnm):
|
||||||
logging.info(f"[HUQIE]:Build trie {fnm}")
|
logging.info(f"[HUQIE]:Build trie from {fnm}")
|
||||||
try:
|
try:
|
||||||
of = open(fnm, "r", encoding='utf-8')
|
of = open(fnm, "r", encoding='utf-8')
|
||||||
while True:
|
while True:
|
||||||
@ -50,7 +50,10 @@ class RagTokenizer:
|
|||||||
if k not in self.trie_ or self.trie_[k][0] < F:
|
if k not in self.trie_ or self.trie_[k][0] < F:
|
||||||
self.trie_[self.key_(line[0])] = (F, line[2])
|
self.trie_[self.key_(line[0])] = (F, line[2])
|
||||||
self.trie_[self.rkey_(line[0])] = 1
|
self.trie_[self.rkey_(line[0])] = 1
|
||||||
self.trie_.save(fnm + ".trie")
|
|
||||||
|
dict_file_cache = fnm + ".trie"
|
||||||
|
logging.info(f"[HUQIE]:Build trie cache to {dict_file_cache}")
|
||||||
|
self.trie_.save(dict_file_cache)
|
||||||
of.close()
|
of.close()
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.exception(f"[HUQIE]:Build trie {fnm} failed")
|
logging.exception(f"[HUQIE]:Build trie {fnm} failed")
|
||||||
@ -58,20 +61,30 @@ class RagTokenizer:
|
|||||||
def __init__(self, debug=False):
|
def __init__(self, debug=False):
|
||||||
self.DEBUG = debug
|
self.DEBUG = debug
|
||||||
self.DENOMINATOR = 1000000
|
self.DENOMINATOR = 1000000
|
||||||
self.trie_ = datrie.Trie(string.printable)
|
|
||||||
self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
|
self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
|
||||||
|
|
||||||
self.stemmer = PorterStemmer()
|
self.stemmer = PorterStemmer()
|
||||||
self.lemmatizer = WordNetLemmatizer()
|
self.lemmatizer = WordNetLemmatizer()
|
||||||
|
|
||||||
self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
|
self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
|
||||||
try:
|
|
||||||
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
|
trie_file_name = self.DIR_ + ".txt.trie"
|
||||||
return
|
# check if trie file existence
|
||||||
except Exception:
|
if os.path.exists(trie_file_name):
|
||||||
logging.exception("[HUQIE]:Build default trie")
|
try:
|
||||||
|
# load trie from file
|
||||||
|
self.trie_ = datrie.Trie.load(trie_file_name)
|
||||||
|
return
|
||||||
|
except Exception:
|
||||||
|
# fail to load trie from file, build default trie
|
||||||
|
logging.exception(f"[HUQIE]:Fail to load trie file {trie_file_name}, build the default trie file")
|
||||||
|
self.trie_ = datrie.Trie(string.printable)
|
||||||
|
else:
|
||||||
|
# file not exist, build default trie
|
||||||
|
logging.info(f"[HUQIE]:Trie file {trie_file_name} not found, build the default trie file")
|
||||||
self.trie_ = datrie.Trie(string.printable)
|
self.trie_ = datrie.Trie(string.printable)
|
||||||
|
|
||||||
|
# load data from dict file and save to trie file
|
||||||
self.loadDict_(self.DIR_ + ".txt")
|
self.loadDict_(self.DIR_ + ".txt")
|
||||||
|
|
||||||
def loadUserDict(self, fnm):
|
def loadUserDict(self, fnm):
|
||||||
@ -86,7 +99,7 @@ class RagTokenizer:
|
|||||||
self.loadDict_(fnm)
|
self.loadDict_(fnm)
|
||||||
|
|
||||||
def _strQ2B(self, ustring):
|
def _strQ2B(self, ustring):
|
||||||
"""把字符串全角转半角"""
|
"""Convert full-width characters to half-width characters"""
|
||||||
rstring = ""
|
rstring = ""
|
||||||
for uchar in ustring:
|
for uchar in ustring:
|
||||||
inside_code = ord(uchar)
|
inside_code = ord(uchar)
|
||||||
@ -94,7 +107,7 @@ class RagTokenizer:
|
|||||||
inside_code = 0x0020
|
inside_code = 0x0020
|
||||||
else:
|
else:
|
||||||
inside_code -= 0xfee0
|
inside_code -= 0xfee0
|
||||||
if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符
|
if inside_code < 0x0020 or inside_code > 0x7e: # After the conversion, if it's not a half-width character, return the original character.
|
||||||
rstring += uchar
|
rstring += uchar
|
||||||
else:
|
else:
|
||||||
rstring += chr(inside_code)
|
rstring += chr(inside_code)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user