refine code (#595)

### What problem does this PR solve?

Renames the `huqie` tokenizer module to `rag/nlp/rag_tokenizer.py` and gives its API descriptive names (`qie` → `tokenize`, `qieqie` → `fine_grained_tokenize`, `needQieqie` → `need_fine_grained_tokenize`), updating every call site across the deepdoc parsers, the rag.app chunkers, and the query/search/term-weight code. It also left-joins `File` when collecting running tasks in `TaskService`, drops the `progress >= 0` filter, adjusts the task pick-up time window, and routes the file-cache log line through `cron_logger`.

### Type of change

- [x] Refactoring
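
For reference, the old `huqie.qie` / `huqie.qieqie` calls map one-to-one onto the new names. Below is a minimal usage sketch, assuming the bundled `rag/res/huqie` dictionary is available (the module builds or loads its trie on first import); the input string is arbitrary and the exact tokens returned depend on that dictionary:

```python
from rag.nlp import rag_tokenizer

# Coarse segmentation: full-width characters are normalized, traditional
# Chinese is converted to simplified, then the trie-based segmenter runs.
tks = rag_tokenizer.tokenize("数据分析项目经理 sql python hive")

# Finer re-segmentation of the coarse tokens, used for the *_sm_tks / *_sm_ltks fields.
sm_tks = rag_tokenizer.fine_grained_tokenize(tks)

# POS tag and corpus-frequency lookups, as used in rag/nlp/term_weight.py.
print(rag_tokenizer.tag("项目"), rag_tokenizer.freq("项目"))
```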
KevinHuSh 2024-04-28 19:13:33 +08:00 committed by GitHub
parent aee8b48d2f
commit 8c07992b6c
24 changed files with 538 additions and 116 deletions

View File

@@ -20,7 +20,7 @@ from flask_login import login_required, current_user
 from elasticsearch_dsl import Q
 from rag.app.qa import rmPrefix, beAdoc
-from rag.nlp import search, huqie
+from rag.nlp import search, rag_tokenizer
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils import rmSpace
 from api.db import LLMType, ParserType
@@ -125,10 +125,10 @@ def set():
     d = {
         "id": req["chunk_id"],
         "content_with_weight": req["content_with_weight"]}
-    d["content_ltks"] = huqie.qie(req["content_with_weight"])
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"])
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req["important_kwd"]
-    d["important_tks"] = huqie.qie(" ".join(req["important_kwd"]))
+    d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"]))
     if "available_int" in req:
         d["available_int"] = req["available_int"]
@@ -152,7 +152,7 @@ def set():
                 retmsg="Q&A must be separated by TAB/ENTER key.")
         q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
         d = beAdoc(d, arr[0], arr[1], not any(
-            [huqie.is_chinese(t) for t in q + a]))
+            [rag_tokenizer.is_chinese(t) for t in q + a]))
     v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
     v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
@@ -202,11 +202,11 @@ def create():
     md5 = hashlib.md5()
     md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
     chunck_id = md5.hexdigest()
-    d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+    d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]),
          "content_with_weight": req["content_with_weight"]}
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     d["important_kwd"] = req.get("important_kwd", [])
-    d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
+    d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", [])))
     d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
     d["create_timestamp_flt"] = datetime.datetime.now().timestamp()

View File

@@ -78,14 +78,13 @@ class TaskService(CommonService):
         docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \
             .join(Document, on=(cls.model.doc_id == Document.id)) \
             .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \
-            .join(File, on=(File2Document.file_id == File.id)) \
+            .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \
             .where(
                 Document.status == StatusEnum.VALID.value,
                 Document.run == TaskStatus.RUNNING.value,
                 ~(Document.type == FileType.VIRTUAL.value),
-                cls.model.progress >= 0,
                 cls.model.progress < 1,
-                cls.model.create_time >= current_timestamp() - 180000
+                cls.model.create_time >= current_timestamp() - 1000 * 600
             )
         docs = list(docs.dicts())
         if not docs: return []

View File

@@ -3,7 +3,7 @@ from docx import Document
 import re
 import pandas as pd
 from collections import Counter
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from io import BytesIO
@@ -35,14 +35,14 @@ class RAGFlowDocxParser:
             for p, n in patt:
                 if re.search(p, b):
                     return n
-            tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1]
+            tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1]
             if len(tks) > 3:
                 if len(tks) < 12:
                     return "Tx"
                 else:
                     return "Lx"
-            if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
+            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                 return "Nr"
             return "Ot"

View File

@@ -16,7 +16,7 @@ from PyPDF2 import PdfReader as pdf2_read
 from api.utils.file_utils import get_project_base_directory
 from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from copy import deepcopy
 from huggingface_hub import snapshot_download
@@ -95,13 +95,13 @@ class RAGFlowPdfParser:
         h = max(self.__height(up), self.__height(down))
         y_dis = self._y_dis(up, down)
         LEN = 6
-        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
-        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
+        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
+        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
         tks_all = up["text"][-LEN:].strip() \
             + (" " if re.match(r"[a-zA-Z0-9]+",
                                up["text"][-1] + down["text"][0]) else "") \
             + down["text"][:LEN].strip()
-        tks_all = huqie.qie(tks_all).split(" ")
+        tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
         fea = [
             up.get("R", -1) == down.get("R", -1),
             y_dis / h,
@@ -142,8 +142,8 @@ class RAGFlowPdfParser:
             tks_down[-1] == tks_up[-1],
             max(down["in_row"], up["in_row"]),
             abs(down["in_row"] - up["in_row"]),
-            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
-            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
+            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
+            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
         ]
         return fea
@@ -599,7 +599,7 @@ class RAGFlowPdfParser:
             if b["text"].strip()[0] != b_["text"].strip()[0] \
                     or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
-                    or huqie.is_chinese(b["text"].strip()[0]) \
+                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                     or b["top"] > b_["bottom"]:
                 i += 1
                 continue

View File

@@ -1,6 +1,6 @@
 import re,json,os
 import pandas as pd
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from . import regions
 current_file_path = os.path.dirname(os.path.abspath(__file__))
 GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0)
@@ -22,14 +22,14 @@ def baike(cid, default_v=0):
 def corpNorm(nm, add_region=True):
     global CORP_TKS
     if not nm or type(nm)!=type(""):return ""
-    nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower()
+    nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower()
     nm = re.sub(r"&amp;", "&", nm)
     nm = re.sub(r"[\(\)\+'\"\t \*\\【】-]+", " ", nm)
     nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE)
     nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE)
     if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm
-    tks = huqie.qie(nm).split(" ")
+    tks = rag_tokenizer.tokenize(nm).split(" ")
     reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)]
     nm = ""
     for t in tks:

View File

@@ -3,7 +3,7 @@ import re, copy, time, datetime, demjson3, \
     traceback, signal
 import numpy as np
 from deepdoc.parser.resume.entities import degrees, schools, corporations
-from rag.nlp import huqie, surname
+from rag.nlp import rag_tokenizer, surname
 from xpinyin import Pinyin
 from contextlib import contextmanager
@@ -83,7 +83,7 @@ def forEdu(cv):
         if n.get("school_name") and isinstance(n["school_name"], str):
             sch.append(re.sub(r"(211|985|重点大学|[,&;-])", "", n["school_name"]))
             e["sch_nm_kwd"] = sch[-1]
-            fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1])
+            fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1])
         if n.get("discipline_name") and isinstance(n["discipline_name"], str):
             maj.append(n["discipline_name"])
@@ -166,10 +166,10 @@ def forEdu(cv):
             if "tag_kwd" not in cv: cv["tag_kwd"] = []
             if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历")
-    if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj))
-    if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch))
-    if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch))
-    if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj))
+    if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj))
+    if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch))
+    if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch))
+    if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj))
     return cv
@@ -187,11 +187,11 @@ def forProj(cv):
         if n.get("achivement"): desc.append(str(n["achivement"]))
     if pro_nms:
-        # cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms))
-        cv["project_name_tks"] = huqie.qie(pro_nms[0])
+        # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms))
+        cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0])
     if desc:
-        cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc)))
-        cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0]))
+        cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc)))
+        cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0]))
     return cv
@@ -280,25 +280,25 @@ def forWork(cv):
     if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"]
     if fea["position_name"]:
-        cv["position_name_tks"] = huqie.qie(fea["position_name"][0])
-        cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"])
-        cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:]))
+        cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0])
+        cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"])
+        cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:]))
     if fea["industry_name"]:
-        cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0])
-        cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"])
-        cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:]))
+        cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0])
+        cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"])
+        cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:]))
     if fea["corporation_name"]:
         cv["corporation_name_kwd"] = fea["corporation_name"][0]
         cv["corp_nm_kwd"] = fea["corporation_name"]
-        cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0])
-        cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"])
-        cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:]))
+        cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0])
+        cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"])
+        cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:]))
     if fea["responsibilities"]:
-        cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0])
-        cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:]))
+        cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0])
+        cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:]))
     if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if
                                                                re.match(r"[^0-9]+$", str(i))]
@@ -444,15 +444,15 @@ def parse(cv):
             if nms:
                 t = k[:-4]
                 cv[f"{t}_kwd"] = nms
-                cv[f"{t}_tks"] = huqie.qie(" ".join(nms))
+                cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms))
         except Exception as e:
             print("【EXCEPTION】:", str(traceback.format_exc()), cv[k])
             cv[k] = []
         # tokenize fields
         if k in tks_fld:
-            cv[f"{k}_tks"] = huqie.qie(cv[k])
-        if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"])
+            cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k])
+        if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"])
         # keyword fields
         if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower()
@@ -492,7 +492,7 @@ def parse(cv):
         cv["name_kwd"] = name
         cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3]
         cv["name_tks"] = (
-            huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
+            rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "")
         ) if name else ""
     else:
         cv["integerity_flt"] /= 2.
@@ -515,7 +515,7 @@ def parse(cv):
         cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d))
     # long text tokenize
-    if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"]))
+    if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"]))
     # for yes or no field
     fea = []

View File

@@ -19,7 +19,7 @@ import numpy as np
 from huggingface_hub import snapshot_download
 from api.utils.file_utils import get_project_base_directory
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from .recognizer import Recognizer
@@ -117,14 +117,14 @@ class TableStructureRecognizer(Recognizer):
         for p, n in patt:
             if re.search(p, b["text"].strip()):
                 return n
-        tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
+        tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1]
         if len(tks) > 3:
             if len(tks) < 12:
                 return "Tx"
             else:
                 return "Lx"
-        if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
+        if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
             return "Nr"
         return "Ot"

View File

@@ -18,7 +18,7 @@ from io import BytesIO
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
     hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
     tokenize_chunks, find_codec
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
@@ -63,9 +63,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     """
     doc = {
         "docnm_kwd": filename,
-        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
     }
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     pdf_parser = None
     sections, tbls = [], []
     if re.search(r"\.docx$", filename, re.IGNORECASE):

View File

@@ -19,7 +19,7 @@ from docx import Document
 from api.db import ParserType
 from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \
     make_colon_as_title, add_positions, tokenize_chunks, find_codec
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, DocxParser, PlainParser
 from rag.settings import cron_logger
@@ -89,9 +89,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     """
     doc = {
         "docnm_kwd": filename,
-        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
     }
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     pdf_parser = None
     sections = []
     if re.search(r"\.docx$", filename, re.IGNORECASE):

View File

@@ -2,7 +2,7 @@ import copy
 import re
 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 from rag.utils import num_tokens_from_string
@@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc = {
         "docnm_kwd": filename
     }
-    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     # is it English
     eng = lang.lower() == "english"  # pdf_parser.is_english

View File

@@ -16,7 +16,7 @@ from docx import Document
 from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
-from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec
+from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
 from rag.settings import cron_logger
@@ -112,9 +112,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True})
     doc = {
         "docnm_kwd": filename,
-        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
     }
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
     sections = []

View File

@@ -14,7 +14,7 @@ from tika import parser
 from io import BytesIO
 import re
 from rag.app import laws
-from rag.nlp import huqie, tokenize, find_codec
+from rag.nlp import rag_tokenizer, tokenize, find_codec
 from deepdoc.parser import PdfParser, ExcelParser, PlainParser
@@ -111,9 +111,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc = {
         "docnm_kwd": filename,
-        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
     }
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     tokenize(doc, "\n".join(sections), eng)
     return [doc]

View File

@@ -15,7 +15,7 @@ import re
 from collections import Counter
 from api.db import ParserType
-from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
 from deepdoc.parser import PdfParser, PlainParser
 import numpy as np
 from rag.utils import num_tokens_from_string
@@ -153,10 +153,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     else:
         raise NotImplementedError("file type not supported yet(pdf supported)")
-    doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]),
-           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
-    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
+    doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]),
+           "title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)}
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"])
     # is it English
     eng = lang.lower() == "english"  # pdf_parser.is_english
     print("It's English.....", eng)

View File

@@ -17,7 +17,7 @@ from io import BytesIO
 from PIL import Image
 from rag.nlp import tokenize, is_english
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
@@ -96,9 +96,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     eng = lang.lower() == "english"
     doc = {
         "docnm_kwd": filename,
-        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
     }
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
         ppt_parser = Ppt()

View File

@@ -16,7 +16,7 @@ from io import BytesIO
 from nltk import word_tokenize
 from openpyxl import load_workbook
 from rag.nlp import is_english, random_choices, find_codec
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser import ExcelParser
@@ -73,8 +73,8 @@ def beAdoc(d, q, a, eng):
     aprefix = "Answer: " if eng else "回答:"
     d["content_with_weight"] = "\t".join(
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
-    d["content_ltks"] = huqie.qie(q)
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = rag_tokenizer.tokenize(q)
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
     return d
@@ -94,7 +94,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     res = []
     doc = {
         "docnm_kwd": filename,
-        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
     }
     if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")

View File

@@ -18,7 +18,7 @@ import re
 import pandas as pd
 import requests
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from deepdoc.parser.resume import refactor
 from deepdoc.parser.resume import step_one, step_two
 from rag.settings import cron_logger
@@ -131,9 +131,9 @@ def chunk(filename, binary=None, callback=None, **kwargs):
         titles.append(str(v))
     doc = {
         "docnm_kwd": filename,
-        "title_tks": huqie.qie("-".join(titles) + "-简历")
+        "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历")
     }
-    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     pairs = []
     for n, m in field_map.items():
         if not resume.get(n):
@@ -147,8 +147,8 @@ def chunk(filename, binary=None, callback=None, **kwargs):
     doc["content_with_weight"] = "\n".join(
         ["{}: {}".format(re.sub(r"[^]+", "", k), v) for k, v in pairs])
-    doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
-    doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
+    doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"])
+    doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"])
     for n, _ in field_map.items():
         if n not in resume:
             continue
@@ -156,7 +156,7 @@ def chunk(filename, binary=None, callback=None, **kwargs):
                 len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
             resume[n] = resume[n][0]
         if n.find("_tks") > 0:
-            resume[n] = huqie.qieqie(resume[n])
+            resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n])
         doc[n] = resume[n]
     print(doc)

View File

@@ -20,7 +20,7 @@ from openpyxl import load_workbook
 from dateutil.parser import parse as datetime_parse
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from rag.nlp import huqie, is_english, tokenize, find_codec
+from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec
 from deepdoc.parser import ExcelParser
@@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
     for ii, row in df.iterrows():
         d = {
             "docnm_kwd": filename,
-            "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
+            "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
         }
         row_txt = []
         for j in range(len(clmns)):
@@ -227,7 +227,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
             if pd.isna(row[clmns[j]]):
                 continue
             fld = clmns_map[j][0]
-            d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
+            d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize(
                 row[clmns[j]])
             row_txt.append("{}:{}".format(clmns[j], row[clmns[j]]))
         if not row_txt:

View File

@@ -2,7 +2,7 @@ import random
 from collections import Counter
 from rag.utils import num_tokens_from_string
-from . import huqie
+from . import rag_tokenizer
 import re
 import copy
@@ -109,8 +109,8 @@ def is_english(texts):
 def tokenize(d, t, eng):
     d["content_with_weight"] = t
     t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
-    d["content_ltks"] = huqie.qie(t)
-    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
+    d["content_ltks"] = rag_tokenizer.tokenize(t)
+    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 def tokenize_chunks(chunks, doc, eng, pdf_parser):

View File

@@ -7,7 +7,7 @@ import logging
 import copy
 from elasticsearch_dsl import Q
-from rag.nlp import huqie, term_weight, synonym
+from rag.nlp import rag_tokenizer, term_weight, synonym
 class EsQueryer:
@@ -47,13 +47,13 @@ class EsQueryer:
         txt = re.sub(
             r"[ \r\n\t,,。??/`!&]+",
             " ",
-            huqie.tradi2simp(
-                huqie.strQ2B(
+            rag_tokenizer.tradi2simp(
+                rag_tokenizer.strQ2B(
                     txt.lower()))).strip()
         txt = EsQueryer.rmWWW(txt)
         if not self.isChinese(txt):
-            tks = huqie.qie(txt).split(" ")
+            tks = rag_tokenizer.tokenize(txt).split(" ")
             q = copy.deepcopy(tks)
             for i in range(1, len(tks)):
                 q.append("\"%s %s\"^2" % (tks[i - 1], tks[i]))
@@ -65,7 +65,7 @@ class EsQueryer:
                 boost=1)#, minimum_should_match=min_match)
             ), tks
-        def needQieqie(tk):
+        def need_fine_grained_tokenize(tk):
             if len(tk) < 4:
                 return False
             if re.match(r"[0-9a-z\.\+#_\*-]+$", tk):
@@ -81,7 +81,7 @@ class EsQueryer:
         logging.info(json.dumps(twts, ensure_ascii=False))
         tms = []
         for tk, w in sorted(twts, key=lambda x: x[1] * -1):
-            sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else []
+            sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else []
             sm = [
                 re.sub(
                     r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+",
@@ -110,10 +110,10 @@ class EsQueryer:
             if len(twts) > 1:
                 tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts]))
             if re.match(r"[0-9a-z ]+$", tt):
-                tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt)
+                tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt)
             syns = " OR ".join(
-                ["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns])
+                ["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns])
             if syns:
                 tms = f"({tms})^5 OR ({syns})^0.7"

rag/nlp/rag_tokenizer.py (new file, 423 lines)
View File

@@ -0,0 +1,423 @@
# -*- coding: utf-8 -*-
import copy
import datrie
import math
import os
import re
import string
import sys
from hanziconv import HanziConv
from huggingface_hub import snapshot_download
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from api.utils.file_utils import get_project_base_directory


class RagTokenizer:
    def key_(self, line):
        return str(line.lower().encode("utf-8"))[2:-1]

    def rkey_(self, line):
        return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1]

    def loadDict_(self, fnm):
        print("[HUQIE]:Build trie", fnm, file=sys.stderr)
        try:
            of = open(fnm, "r")
            while True:
                line = of.readline()
                if not line:
                    break
                line = re.sub(r"[\r\n]+", "", line)
                line = re.split(r"[ \t]", line)
                k = self.key_(line[0])
                F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5)
                if k not in self.trie_ or self.trie_[k][0] < F:
                    self.trie_[self.key_(line[0])] = (F, line[2])
                self.trie_[self.rkey_(line[0])] = 1
            self.trie_.save(fnm + ".trie")
            of.close()
        except Exception as e:
            print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr)

    def __init__(self, debug=False):
        self.DEBUG = debug
        self.DENOMINATOR = 1000000
        self.trie_ = datrie.Trie(string.printable)
        self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie")

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

        self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)"
        try:
            self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie")
            return
        except Exception as e:
            print("[HUQIE]:Build default trie", file=sys.stderr)
            self.trie_ = datrie.Trie(string.printable)

        self.loadDict_(self.DIR_ + ".txt")

    def loadUserDict(self, fnm):
        try:
            self.trie_ = datrie.Trie.load(fnm + ".trie")
            return
        except Exception as e:
            self.trie_ = datrie.Trie(string.printable)
        self.loadDict_(fnm)

    def addUserDict(self, fnm):
        self.loadDict_(fnm)

    def _strQ2B(self, ustring):
        """把字符串全角转半角"""
        rstring = ""
        for uchar in ustring:
            inside_code = ord(uchar)
            if inside_code == 0x3000:
                inside_code = 0x0020
            else:
                inside_code -= 0xfee0
            if inside_code < 0x0020 or inside_code > 0x7e:  # 转完之后不是半角字符返回原来的字符
                rstring += uchar
            else:
                rstring += chr(inside_code)
        return rstring

    def _tradi2simp(self, line):
        return HanziConv.toSimplified(line)

    def dfs_(self, chars, s, preTks, tkslist):
        MAX_L = 10
        res = s
        # if s > MAX_L or s>= len(chars):
        if s >= len(chars):
            tkslist.append(preTks)
            return res

        # pruning
        S = s + 1
        if s + 2 <= len(chars):
            t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2])
            if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix(
                    self.key_(t2)):
                S = s + 2
        if len(preTks) > 2 and len(
                preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1:
            t1 = preTks[-1][0] + "".join(chars[s:s + 1])
            if self.trie_.has_keys_with_prefix(self.key_(t1)):
                S = s + 2

        ################
        for e in range(S, len(chars) + 1):
            t = "".join(chars[s:e])
            k = self.key_(t)
            if e > s + 1 and not self.trie_.has_keys_with_prefix(k):
                break
            if k in self.trie_:
                pretks = copy.deepcopy(preTks)
                if k in self.trie_:
                    pretks.append((t, self.trie_[k]))
                else:
                    pretks.append((t, (-12, '')))
                res = max(res, self.dfs_(chars, e, pretks, tkslist))

        if res > s:
            return res

        t = "".join(chars[s:s + 1])
        k = self.key_(t)
        if k in self.trie_:
            preTks.append((t, self.trie_[k]))
        else:
            preTks.append((t, (-12, '')))

        return self.dfs_(chars, s + 1, preTks, tkslist)

    def freq(self, tk):
        k = self.key_(tk)
        if k not in self.trie_:
            return 0
        return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5)

    def tag(self, tk):
        k = self.key_(tk)
        if k not in self.trie_:
            return ""
        return self.trie_[k][1]

    def score_(self, tfts):
        B = 30
        F, L, tks = 0, 0, []
        for tk, (freq, tag) in tfts:
            F += freq
            L += 0 if len(tk) < 2 else 1
            tks.append(tk)
        F /= len(tks)
        L /= len(tks)
        if self.DEBUG:
            print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F)
        return tks, B / len(tks) + L + F

    def sortTks_(self, tkslist):
        res = []
        for tfts in tkslist:
            tks, s = self.score_(tfts)
            res.append((tks, s))
        return sorted(res, key=lambda x: x[1], reverse=True)

    def merge_(self, tks):
        patts = [
            (r"[ ]+", " "),
            (r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"),
        ]
        # for p,s in patts: tks = re.sub(p, s, tks)

        # if split chars is part of token
        res = []
        tks = re.sub(r"[ ]+", " ", tks).split(" ")
        s = 0
        while True:
            if s >= len(tks):
                break
            E = s + 1
            for e in range(s + 2, min(len(tks) + 2, s + 6)):
                tk = "".join(tks[s:e])
                if re.search(self.SPLIT_CHAR, tk) and self.freq(tk):
                    E = e
            res.append("".join(tks[s:E]))
            s = E

        return " ".join(res)

    def maxForward_(self, line):
        res = []
        s = 0
        while s < len(line):
            e = s + 1
            t = line[s:e]
            while e < len(line) and self.trie_.has_keys_with_prefix(
                    self.key_(t)):
                e += 1
                t = line[s:e]

            while e - 1 > s and self.key_(t) not in self.trie_:
                e -= 1
                t = line[s:e]

            if self.key_(t) in self.trie_:
                res.append((t, self.trie_[self.key_(t)]))
            else:
                res.append((t, (0, '')))

            s = e

        return self.score_(res)

    def maxBackward_(self, line):
        res = []
        s = len(line) - 1
        while s >= 0:
            e = s + 1
            t = line[s:e]
            while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)):
                s -= 1
                t = line[s:e]

            while s + 1 < e and self.key_(t) not in self.trie_:
                s += 1
                t = line[s:e]

            if self.key_(t) in self.trie_:
                res.append((t, self.trie_[self.key_(t)]))
            else:
                res.append((t, (0, '')))

            s -= 1

        return self.score_(res[::-1])

    def tokenize(self, line):
        line = self._strQ2B(line).lower()
        line = self._tradi2simp(line)
        zh_num = len([1 for c in line if is_chinese(c)])
        if zh_num < len(line) * 0.2:
            return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)])

        arr = re.split(self.SPLIT_CHAR, line)
        res = []
        for L in arr:
            if len(L) < 2 or re.match(
                    r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L):
                res.append(L)
                continue
            # print(L)

            # use maxforward for the first time
            tks, s = self.maxForward_(L)
            tks1, s1 = self.maxBackward_(L)
            if self.DEBUG:
                print("[FW]", tks, s)
                print("[BW]", tks1, s1)

            diff = [0 for _ in range(max(len(tks1), len(tks)))]
            for i in range(min(len(tks1), len(tks))):
                if tks[i] != tks1[i]:
                    diff[i] = 1

            if s1 > s:
                tks = tks1

            i = 0
            while i < len(tks):
                s = i
                while s < len(tks) and diff[s] == 0:
                    s += 1
                if s == len(tks):
                    res.append(" ".join(tks[i:]))
                    break
                if s > i:
                    res.append(" ".join(tks[i:s]))

                e = s
                while e < len(tks) and e - s < 5 and diff[e] == 1:
                    e += 1

                tkslist = []
                self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist)
                res.append(" ".join(self.sortTks_(tkslist)[0][0]))

                i = e + 1

        res = " ".join(res)
        if self.DEBUG:
            print("[TKS]", self.merge_(res))
        return self.merge_(res)

    def fine_grained_tokenize(self, tks):
        tks = tks.split(" ")
        zh_num = len([1 for c in tks if c and is_chinese(c[0])])
        if zh_num < len(tks) * 0.2:
            res = []
            for tk in tks:
                res.extend(tk.split("/"))
            return " ".join(res)

        res = []
        for tk in tks:
            if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk):
                res.append(tk)
                continue
            tkslist = []
            if len(tk) > 10:
                tkslist.append(tk)
            else:
                self.dfs_(tk, 0, [], tkslist)
            if len(tkslist) < 2:
                res.append(tk)
                continue
            stk = self.sortTks_(tkslist)[1][0]
            if len(stk) == len(tk):
                stk = tk
            else:
                if re.match(r"[a-z\.-]+$", tk):
                    for t in stk:
                        if len(t) < 3:
                            stk = tk
                            break
                    else:
                        stk = " ".join(stk)
                else:
                    stk = " ".join(stk)

            res.append(stk)

        return " ".join(res)


def is_chinese(s):
    if s >= u'\u4e00' and s <= u'\u9fa5':
        return True
    else:
        return False


def is_number(s):
    if s >= u'\u0030' and s <= u'\u0039':
        return True
    else:
        return False


def is_alphabet(s):
    if (s >= u'\u0041' and s <= u'\u005a') or (
            s >= u'\u0061' and s <= u'\u007a'):
        return True
    else:
        return False


def naiveQie(txt):
    tks = []
    for t in txt.split(" "):
        if tks and re.match(r".*[a-zA-Z]$", tks[-1]
                            ) and re.match(r".*[a-zA-Z]$", t):
            tks.append(" ")
        tks.append(t)
    return tks


tokenizer = RagTokenizer()
tokenize = tokenizer.tokenize
fine_grained_tokenize = tokenizer.fine_grained_tokenize
tag = tokenizer.tag
freq = tokenizer.freq
loadUserDict = tokenizer.loadUserDict
addUserDict = tokenizer.addUserDict
tradi2simp = tokenizer._tradi2simp
strQ2B = tokenizer._strQ2B


if __name__ == '__main__':
    tknzr = RagTokenizer(debug=True)
    # huqie.addUserDict("/tmp/tmp.new.tks.dict")
    tks = tknzr.tokenize(
        "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize(
        "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize(
        "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize(
        "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize("虽然我不怎么玩")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize(
        "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ")
    print(tknzr.fine_grained_tokenize(tks))
    tks = tknzr.tokenize(
        "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-")
    print(tknzr.fine_grained_tokenize(tks))
    if len(sys.argv) < 2:
        sys.exit()
    tknzr.DEBUG = False
    tknzr.loadUserDict(sys.argv[1])
    of = open(sys.argv[2], "r")
    while True:
        line = of.readline()
        if not line:
            break
        print(tknzr.tokenize(line))
    of.close()

View File

@@ -9,7 +9,7 @@ from dataclasses import dataclass
 from rag.settings import es_logger
 from rag.utils import rmSpace
-from rag.nlp import huqie, query
+from rag.nlp import rag_tokenizer, query
 import numpy as np
@@ -128,7 +128,7 @@ class Dealer:
         kwds = set([])
         for k in keywords:
             kwds.add(k)
-            for kk in huqie.qieqie(k).split(" "):
+            for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
                 if len(kk) < 2:
                     continue
                 if kk in kwds:
@@ -243,7 +243,7 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))
-        chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ")
+        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
                       for ck in chunks]
         cites = {}
         thr = 0.63
@@ -251,7 +251,7 @@ class Dealer:
         for i, a in enumerate(pieces_):
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
-                                                            huqie.qie(
+                                                            rag_tokenizer.tokenize(
                                                                 self.qryr.rmWWW(pieces_[i])).split(" "),
                                                             chunks_tks,
                                                             tkweight, vtweight)
@@ -310,8 +310,8 @@ class Dealer:
     def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
         return self.qryr.hybrid_similarity(ans_embd,
                                            ins_embd,
-                                           huqie.qie(ans).split(" "),
-                                           huqie.qie(inst).split(" "))
+                                           rag_tokenizer.tokenize(ans).split(" "),
+                                           rag_tokenizer.tokenize(inst).split(" "))
     def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True):
@@ -385,7 +385,7 @@ class Dealer:
         for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql):
             fld, v = r.group(1), r.group(3)
             match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format(
-                fld, huqie.qieqie(huqie.qie(v)))
+                fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v)))
             replaces.append(
                 ("{}{}'{}'".format(
                     r.group(1),

View File

@@ -4,7 +4,7 @@ import json
 import re
 import os
 import numpy as np
-from rag.nlp import huqie
+from rag.nlp import rag_tokenizer
 from api.utils.file_utils import get_project_base_directory
@@ -83,7 +83,7 @@ class Dealer:
             txt = re.sub(p, r, txt)
         res = []
-        for t in huqie.qie(txt).split(" "):
+        for t in rag_tokenizer.tokenize(txt).split(" "):
             tk = t
             if (stpwd and tk in self.stop_words) or (
                     re.match(r"[0-9]$", tk) and not num):
@@ -161,7 +161,7 @@ class Dealer:
             return m[self.ne[t]]
         def postag(t):
-            t = huqie.tag(t)
+            t = rag_tokenizer.tag(t)
            if t in set(["r", "c", "d"]):
                 return 0.3
             if t in set(["ns", "nt"]):
@@ -175,14 +175,14 @@ class Dealer:
         def freq(t):
             if re.match(r"[0-9. -]{2,}$", t):
                 return 3
-            s = huqie.freq(t)
+            s = rag_tokenizer.freq(t)
             if not s and re.match(r"[a-z. -]+$", t):
                 return 300
             if not s:
                 s = 0
             if not s and len(t) >= 4:
-                s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
                     s = np.min([freq(tt) for tt in s]) / 6.
                 else:
@@ -198,7 +198,7 @@ class Dealer:
             elif re.match(r"[a-z. -]+$", t):
                 return 300
             elif len(t) >= 4:
-                s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1]
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
                 if len(s) > 1:
                     return max(3, np.min([df(tt) for tt in s]) / 6.)

View File

@@ -4,13 +4,14 @@ import traceback
 from api.db.db_models import close_connection
 from api.db.services.task_service import TaskService
+from rag.settings import cron_logger
 from rag.utils.minio_conn import MINIO
 from rag.utils.redis_conn import REDIS_CONN
 def collect():
     doc_locations = TaskService.get_ongoing_doc_name()
-    #print(tasks)
+    print(doc_locations)
     if len(doc_locations) == 0:
         time.sleep(1)
         return
@@ -28,7 +29,7 @@ def main():
                     if REDIS_CONN.exist(key):continue
                     file_bin = MINIO.get(kb_id, loc)
                     REDIS_CONN.transaction(key, file_bin, 12 * 60)
-                    print("CACHE:", loc)
+                    cron_logger.info("CACHE: {}".format(loc))
                 except Exception as e:
                     traceback.print_stack(e)
         except Exception as e:

View File

@@ -21,7 +21,6 @@ from datetime import datetime
 from api.db.db_models import Task
 from api.db.db_utils import bulk_insert_into_db
 from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
 from api.db.services.task_service import TaskService
 from deepdoc.parser import PdfParser
 from deepdoc.parser.excel_parser import RAGFlowExcelParser