Refactor trie load and construct (#4083)

### What problem does this PR solve?

1. Fix the initial building and loading of the trie
2. Update comments

### Type of change

- [x] Refactoring

Signed-off-by: jinhai <haijin.chn@gmail.com>
This commit is contained in:
Jin Hai 2024-12-18 12:52:56 +08:00 committed by GitHub
parent f8cef73244
commit 50c2b9d562
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -36,7 +36,7 @@ class RagTokenizer:
return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1] return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]
def loadDict_(self, fnm): def loadDict_(self, fnm):
logging.info(f"[HUQIE]:Build trie {fnm}") logging.info(f"[HUQIE]:Build trie from {fnm}")
try: try:
of = open(fnm, "r", encoding='utf-8') of = open(fnm, "r", encoding='utf-8')
while True: while True:
@ -50,7 +50,10 @@ class RagTokenizer:
if k not in self.trie_ or self.trie_[k][0] < F: if k not in self.trie_ or self.trie_[k][0] < F:
self.trie_[self.key_(line[0])] = (F, line[2]) self.trie_[self.key_(line[0])] = (F, line[2])
self.trie_[self.rkey_(line[0])] = 1 self.trie_[self.rkey_(line[0])] = 1
self.trie_.save(fnm + ".trie")
dict_file_cache = fnm + ".trie"
logging.info(f"[HUQIE]:Build trie cache to {dict_file_cache}")
self.trie_.save(dict_file_cache)
of.close() of.close()
except Exception: except Exception:
logging.exception(f"[HUQIE]:Build trie {fnm} failed") logging.exception(f"[HUQIE]:Build trie {fnm} failed")
@ -58,20 +61,30 @@ class RagTokenizer:
def __init__(self, debug=False): def __init__(self, debug=False):
self.DEBUG = debug self.DEBUG = debug
self.DENOMINATOR = 1000000 self.DENOMINATOR = 1000000
self.trie_ = datrie.Trie(string.printable)
self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie") self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")
self.stemmer = PorterStemmer() self.stemmer = PorterStemmer()
self.lemmatizer = WordNetLemmatizer() self.lemmatizer = WordNetLemmatizer()
self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)" self.SPLIT_CHAR = r"([ ,\.<>/?;:'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
try:
self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie") trie_file_name = self.DIR_ + ".txt.trie"
return # check if trie file existence
except Exception: if os.path.exists(trie_file_name):
logging.exception("[HUQIE]:Build default trie") try:
# load trie from file
self.trie_ = datrie.Trie.load(trie_file_name)
return
except Exception:
# fail to load trie from file, build default trie
logging.exception(f"[HUQIE]:Fail to load trie file {trie_file_name}, build the default trie file")
self.trie_ = datrie.Trie(string.printable)
else:
# file not exist, build default trie
logging.info(f"[HUQIE]:Trie file {trie_file_name} not found, build the default trie file")
self.trie_ = datrie.Trie(string.printable) self.trie_ = datrie.Trie(string.printable)
# load data from dict file and save to trie file
self.loadDict_(self.DIR_ + ".txt") self.loadDict_(self.DIR_ + ".txt")
def loadUserDict(self, fnm): def loadUserDict(self, fnm):
@ -86,7 +99,7 @@ class RagTokenizer:
self.loadDict_(fnm) self.loadDict_(fnm)
def _strQ2B(self, ustring): def _strQ2B(self, ustring):
"""把字符串全角转半角""" """Convert full-width characters to half-width characters"""
rstring = "" rstring = ""
for uchar in ustring: for uchar in ustring:
inside_code = ord(uchar) inside_code = ord(uchar)
@ -94,7 +107,7 @@ class RagTokenizer:
inside_code = 0x0020 inside_code = 0x0020
else: else:
inside_code -= 0xfee0 inside_code -= 0xfee0
if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符 if inside_code < 0x0020 or inside_code > 0x7e: # After the conversion, if it's not a half-width character, return the original character.
rstring += uchar rstring += uchar
else: else:
rstring += chr(inside_code) rstring += chr(inside_code)