From 8c07992b6c28f82ae0a493a98d3b7d60f599496a Mon Sep 17 00:00:00 2001 From: KevinHuSh Date: Sun, 28 Apr 2024 19:13:33 +0800 Subject: [PATCH] refine code (#595) ### What problem does this PR solve? ### Type of change - [x] Refactoring --- api/apps/chunk_app.py | 16 +- api/db/services/task_service.py | 5 +- deepdoc/parser/docx_parser.py | 6 +- deepdoc/parser/pdf_parser.py | 14 +- .../parser/resume/entities/corporations.py | 6 +- deepdoc/parser/resume/step_two.py | 52 +-- deepdoc/vision/table_structure_recognizer.py | 6 +- rag/app/book.py | 6 +- rag/app/laws.py | 6 +- rag/app/manual.py | 6 +- rag/app/naive.py | 6 +- rag/app/one.py | 6 +- rag/app/paper.py | 10 +- rag/app/presentation.py | 6 +- rag/app/qa.py | 8 +- rag/app/resume.py | 12 +- rag/app/table.py | 6 +- rag/nlp/__init__.py | 6 +- rag/nlp/query.py | 16 +- rag/nlp/rag_tokenizer.py | 423 ++++++++++++++++++ rag/nlp/search.py | 14 +- rag/nlp/term_weight.py | 12 +- rag/svr/cache_file_svr.py | 5 +- rag/svr/task_broker.py | 1 - 24 files changed, 538 insertions(+), 116 deletions(-) create mode 100644 rag/nlp/rag_tokenizer.py diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index 2929e8892..4555dec1a 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -20,7 +20,7 @@ from flask_login import login_required, current_user from elasticsearch_dsl import Q from rag.app.qa import rmPrefix, beAdoc -from rag.nlp import search, huqie +from rag.nlp import search, rag_tokenizer from rag.utils.es_conn import ELASTICSEARCH from rag.utils import rmSpace from api.db import LLMType, ParserType @@ -125,10 +125,10 @@ def set(): d = { "id": req["chunk_id"], "content_with_weight": req["content_with_weight"]} - d["content_ltks"] = huqie.qie(req["content_with_weight"]) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + d["content_ltks"] = rag_tokenizer.tokenize(req["content_with_weight"]) + d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) d["important_kwd"] = req["important_kwd"] - d["important_tks"] = huqie.qie(" ".join(req["important_kwd"])) + d["important_tks"] = rag_tokenizer.tokenize(" ".join(req["important_kwd"])) if "available_int" in req: d["available_int"] = req["available_int"] @@ -152,7 +152,7 @@ def set(): retmsg="Q&A must be separated by TAB/ENTER key.") q, a = rmPrefix(arr[0]), rmPrefix[arr[1]] d = beAdoc(d, arr[0], arr[1], not any( - [huqie.is_chinese(t) for t in q + a])) + [rag_tokenizer.is_chinese(t) for t in q + a])) v, c = embd_mdl.encode([doc.name, req["content_with_weight"]]) v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1] @@ -202,11 +202,11 @@ def create(): md5 = hashlib.md5() md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8")) chunck_id = md5.hexdigest() - d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]), + d = {"id": chunck_id, "content_ltks": rag_tokenizer.tokenize(req["content_with_weight"]), "content_with_weight": req["content_with_weight"]} - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) d["important_kwd"] = req.get("important_kwd", []) - d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", []))) + d["important_tks"] = rag_tokenizer.tokenize(" ".join(req.get("important_kwd", []))) d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19] d["create_timestamp_flt"] = datetime.datetime.now().timestamp() diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py index 
ccc837a03..083847509 100644 --- a/api/db/services/task_service.py +++ b/api/db/services/task_service.py @@ -78,14 +78,13 @@ class TaskService(CommonService): docs = cls.model.select(*[Document.id, Document.kb_id, Document.location, File.parent_id]) \ .join(Document, on=(cls.model.doc_id == Document.id)) \ .join(File2Document, on=(File2Document.document_id == Document.id), join_type=JOIN.LEFT_OUTER) \ - .join(File, on=(File2Document.file_id == File.id)) \ + .join(File, on=(File2Document.file_id == File.id), join_type=JOIN.LEFT_OUTER) \ .where( Document.status == StatusEnum.VALID.value, Document.run == TaskStatus.RUNNING.value, ~(Document.type == FileType.VIRTUAL.value), - cls.model.progress >= 0, cls.model.progress < 1, - cls.model.create_time >= current_timestamp() - 180000 + cls.model.create_time >= current_timestamp() - 1000 * 600 ) docs = list(docs.dicts()) if not docs: return [] diff --git a/deepdoc/parser/docx_parser.py b/deepdoc/parser/docx_parser.py index e45b5d51a..42923f0fc 100644 --- a/deepdoc/parser/docx_parser.py +++ b/deepdoc/parser/docx_parser.py @@ -3,7 +3,7 @@ from docx import Document import re import pandas as pd from collections import Counter -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from io import BytesIO @@ -35,14 +35,14 @@ class RAGFlowDocxParser: for p, n in patt: if re.search(p, b): return n - tks = [t for t in huqie.qie(b).split(" ") if len(t) > 1] + tks = [t for t in rag_tokenizer.tokenize(b).split(" ") if len(t) > 1] if len(tks) > 3: if len(tks) < 12: return "Tx" else: return "Lx" - if len(tks) == 1 and huqie.tag(tks[0]) == "nr": + if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr": return "Nr" return "Ot" diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 4c3255c70..8c0ac0045 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -16,7 +16,7 @@ from PyPDF2 import PdfReader as pdf2_read from api.utils.file_utils import get_project_base_directory from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from copy import deepcopy from huggingface_hub import snapshot_download @@ -95,13 +95,13 @@ class RAGFlowPdfParser: h = max(self.__height(up), self.__height(down)) y_dis = self._y_dis(up, down) LEN = 6 - tks_down = huqie.qie(down["text"][:LEN]).split(" ") - tks_up = huqie.qie(up["text"][-LEN:]).split(" ") + tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ") + tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ") tks_all = up["text"][-LEN:].strip() \ + (" " if re.match(r"[a-zA-Z0-9]+", up["text"][-1] + down["text"][0]) else "") \ + down["text"][:LEN].strip() - tks_all = huqie.qie(tks_all).split(" ") + tks_all = rag_tokenizer.tokenize(tks_all).split(" ") fea = [ up.get("R", -1) == down.get("R", -1), y_dis / h, @@ -142,8 +142,8 @@ class RAGFlowPdfParser: tks_down[-1] == tks_up[-1], max(down["in_row"], up["in_row"]), abs(down["in_row"] - up["in_row"]), - len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0, - len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0 + len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0, + len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0 ] return fea @@ -599,7 +599,7 @@ class RAGFlowPdfParser: if b["text"].strip()[0] != b_["text"].strip()[0] \ or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \ - or huqie.is_chinese(b["text"].strip()[0]) \ + or rag_tokenizer.is_chinese(b["text"].strip()[0]) \ 
or b["top"] > b_["bottom"]: i += 1 continue diff --git a/deepdoc/parser/resume/entities/corporations.py b/deepdoc/parser/resume/entities/corporations.py index 6b7b38a5b..54970866f 100644 --- a/deepdoc/parser/resume/entities/corporations.py +++ b/deepdoc/parser/resume/entities/corporations.py @@ -1,6 +1,6 @@ import re,json,os import pandas as pd -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from . import regions current_file_path = os.path.dirname(os.path.abspath(__file__)) GOODS = pd.read_csv(os.path.join(current_file_path, "res/corp_baike_len.csv"), sep="\t", header=0).fillna(0) @@ -22,14 +22,14 @@ def baike(cid, default_v=0): def corpNorm(nm, add_region=True): global CORP_TKS if not nm or type(nm)!=type(""):return "" - nm = huqie.tradi2simp(huqie.strQ2B(nm)).lower() + nm = rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(nm)).lower() nm = re.sub(r"&", "&", nm) nm = re.sub(r"[\(\)()\+'\"\t \*\\【】-]+", " ", nm) nm = re.sub(r"([—-]+.*| +co\..*|corp\..*| +inc\..*| +ltd.*)", "", nm, 10000, re.IGNORECASE) nm = re.sub(r"(计算机|技术|(技术|科技|网络)*有限公司|公司|有限|研发中心|中国|总部)$", "", nm, 10000, re.IGNORECASE) if not nm or (len(nm)<5 and not regions.isName(nm[0:2])):return nm - tks = huqie.qie(nm).split(" ") + tks = rag_tokenizer.tokenize(nm).split(" ") reg = [t for i,t in enumerate(tks) if regions.isName(t) and (t != "中国" or i > 0)] nm = "" for t in tks: diff --git a/deepdoc/parser/resume/step_two.py b/deepdoc/parser/resume/step_two.py index 20b0223a2..ff6116577 100644 --- a/deepdoc/parser/resume/step_two.py +++ b/deepdoc/parser/resume/step_two.py @@ -3,7 +3,7 @@ import re, copy, time, datetime, demjson3, \ traceback, signal import numpy as np from deepdoc.parser.resume.entities import degrees, schools, corporations -from rag.nlp import huqie, surname +from rag.nlp import rag_tokenizer, surname from xpinyin import Pinyin from contextlib import contextmanager @@ -83,7 +83,7 @@ def forEdu(cv): if n.get("school_name") and isinstance(n["school_name"], str): sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"])) e["sch_nm_kwd"] = sch[-1] - fea.append(huqie.qieqie(huqie.qie(n.get("school_name", ""))).split(" ")[-1]) + fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1]) if n.get("discipline_name") and isinstance(n["discipline_name"], str): maj.append(n["discipline_name"]) @@ -166,10 +166,10 @@ def forEdu(cv): if "tag_kwd" not in cv: cv["tag_kwd"] = [] if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历") - if cv.get("major_kwd"): cv["major_tks"] = huqie.qie(" ".join(maj)) - if cv.get("school_name_kwd"): cv["school_name_tks"] = huqie.qie(" ".join(sch)) - if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = huqie.qie(" ".join(fsch)) - if cv.get("first_major_kwd"): cv["first_major_tks"] = huqie.qie(" ".join(fmaj)) + if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj)) + if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch)) + if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch)) + if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj)) return cv @@ -187,11 +187,11 @@ def forProj(cv): if n.get("achivement"): desc.append(str(n["achivement"])) if pro_nms: - # cv["pro_nms_tks"] = huqie.qie(" ".join(pro_nms)) - cv["project_name_tks"] = huqie.qie(pro_nms[0]) + # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms)) + cv["project_name_tks"] = 
rag_tokenizer.tokenize(pro_nms[0]) if desc: - cv["pro_desc_ltks"] = huqie.qie(rmHtmlTag(" ".join(desc))) - cv["project_desc_ltks"] = huqie.qie(rmHtmlTag(desc[0])) + cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc))) + cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0])) return cv @@ -280,25 +280,25 @@ def forWork(cv): if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"] if fea["position_name"]: - cv["position_name_tks"] = huqie.qie(fea["position_name"][0]) - cv["position_name_sm_tks"] = huqie.qieqie(cv["position_name_tks"]) - cv["pos_nm_tks"] = huqie.qie(" ".join(fea["position_name"][1:])) + cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0]) + cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"]) + cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:])) if fea["industry_name"]: - cv["industry_name_tks"] = huqie.qie(fea["industry_name"][0]) - cv["industry_name_sm_tks"] = huqie.qieqie(cv["industry_name_tks"]) - cv["indu_nm_tks"] = huqie.qie(" ".join(fea["industry_name"][1:])) + cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0]) + cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"]) + cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:])) if fea["corporation_name"]: cv["corporation_name_kwd"] = fea["corporation_name"][0] cv["corp_nm_kwd"] = fea["corporation_name"] - cv["corporation_name_tks"] = huqie.qie(fea["corporation_name"][0]) - cv["corporation_name_sm_tks"] = huqie.qieqie(cv["corporation_name_tks"]) - cv["corp_nm_tks"] = huqie.qie(" ".join(fea["corporation_name"][1:])) + cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0]) + cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"]) + cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:])) if fea["responsibilities"]: - cv["responsibilities_ltks"] = huqie.qie(fea["responsibilities"][0]) - cv["resp_ltks"] = huqie.qie(" ".join(fea["responsibilities"][1:])) + cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0]) + cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:])) if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if re.match(r"[^0-9]+$", str(i))] @@ -444,15 +444,15 @@ def parse(cv): if nms: t = k[:-4] cv[f"{t}_kwd"] = nms - cv[f"{t}_tks"] = huqie.qie(" ".join(nms)) + cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms)) except Exception as e: print("【EXCEPTION】:", str(traceback.format_exc()), cv[k]) cv[k] = [] # tokenize fields if k in tks_fld: - cv[f"{k}_tks"] = huqie.qie(cv[k]) - if k in small_tks_fld: cv[f"{k}_sm_tks"] = huqie.qie(cv[f"{k}_tks"]) + cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k]) + if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"]) # keyword fields if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower() @@ -492,7 +492,7 @@ def parse(cv): cv["name_kwd"] = name cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3] cv["name_tks"] = ( - huqie.qie(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "") + rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "") ) if name else "" else: cv["integerity_flt"] /= 2. 
@@ -515,7 +515,7 @@ def parse(cv): cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d)) # long text tokenize - if cv.get("responsibilities"): cv["responsibilities_ltks"] = huqie.qie(rmHtmlTag(cv["responsibilities"])) + if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"])) # for yes or no field fea = [] diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py index 548eb6204..70d750bd9 100644 --- a/deepdoc/vision/table_structure_recognizer.py +++ b/deepdoc/vision/table_structure_recognizer.py @@ -19,7 +19,7 @@ import numpy as np from huggingface_hub import snapshot_download from api.utils.file_utils import get_project_base_directory -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from .recognizer import Recognizer @@ -117,14 +117,14 @@ class TableStructureRecognizer(Recognizer): for p, n in patt: if re.search(p, b["text"].strip()): return n - tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1] + tks = [t for t in rag_tokenizer.tokenize(b["text"]).split(" ") if len(t) > 1] if len(tks) > 3: if len(tks) < 12: return "Tx" else: return "Lx" - if len(tks) == 1 and huqie.tag(tks[0]) == "nr": + if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr": return "Nr" return "Ot" diff --git a/rag/app/book.py b/rag/app/book.py index 732ba8819..70aee29c2 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -18,7 +18,7 @@ from io import BytesIO from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \ tokenize_chunks, find_codec -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, DocxParser, PlainParser @@ -63,9 +63,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, """ doc = { "docnm_kwd": filename, - "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) } - doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) pdf_parser = None sections, tbls = [], [] if re.search(r"\.docx$", filename, re.IGNORECASE): diff --git a/rag/app/laws.py b/rag/app/laws.py index 65346576e..473eca9c7 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -19,7 +19,7 @@ from docx import Document from api.db import ParserType from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, hierarchical_merge, \ make_colon_as_title, add_positions, tokenize_chunks, find_codec -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, DocxParser, PlainParser from rag.settings import cron_logger @@ -89,9 +89,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, """ doc = { "docnm_kwd": filename, - "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) } - doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) pdf_parser = None sections = [] if re.search(r"\.docx$", filename, re.IGNORECASE): diff --git a/rag/app/manual.py b/rag/app/manual.py index f1a6c55f9..e1115489e 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -2,7 +2,7 @@ import copy import re from api.db import ParserType -from rag.nlp import 
huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks +from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from deepdoc.parser import PdfParser, PlainParser from rag.utils import num_tokens_from_string @@ -70,8 +70,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc = { "docnm_kwd": filename } - doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) - doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) # is it English eng = lang.lower() == "english" # pdf_parser.is_english diff --git a/rag/app/naive.py b/rag/app/naive.py index d341b8c65..55fab84c2 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -16,7 +16,7 @@ from docx import Document from timeit import default_timer as timer import re from deepdoc.parser.pdf_parser import PlainParser -from rag.nlp import huqie, naive_merge, tokenize_table, tokenize_chunks, find_codec +from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec from deepdoc.parser import PdfParser, ExcelParser, DocxParser from rag.settings import cron_logger @@ -112,9 +112,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, "chunk_token_num": 128, "delimiter": "\n!?。;!?", "layout_recognize": True}) doc = { "docnm_kwd": filename, - "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) } - doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] pdf_parser = None sections = [] diff --git a/rag/app/one.py b/rag/app/one.py index 959eb2894..f5c78f5aa 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -14,7 +14,7 @@ from tika import parser from io import BytesIO import re from rag.app import laws -from rag.nlp import huqie, tokenize, find_codec +from rag.nlp import rag_tokenizer, tokenize, find_codec from deepdoc.parser import PdfParser, ExcelParser, PlainParser @@ -111,9 +111,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, doc = { "docnm_kwd": filename, - "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) } - doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) tokenize(doc, "\n".join(sections), eng) return [doc] diff --git a/rag/app/paper.py b/rag/app/paper.py index a667dc0a7..1b3c1df8a 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -15,7 +15,7 @@ import re from collections import Counter from api.db import ParserType -from rag.nlp import huqie, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks +from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from deepdoc.parser import PdfParser, PlainParser import numpy as np from rag.utils import num_tokens_from_string @@ -153,10 +153,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, else: raise NotImplementedError("file type not supported yet(pdf supported)") - doc = {"docnm_kwd": filename, "authors_tks": huqie.qie(paper["authors"]), - "title_tks": 
huqie.qie(paper["title"] if paper["title"] else filename)} - doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) - doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"]) + doc = {"docnm_kwd": filename, "authors_tks": rag_tokenizer.tokenize(paper["authors"]), + "title_tks": rag_tokenizer.tokenize(paper["title"] if paper["title"] else filename)} + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) + doc["authors_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["authors_tks"]) # is it English eng = lang.lower() == "english" # pdf_parser.is_english print("It's English.....", eng) diff --git a/rag/app/presentation.py b/rag/app/presentation.py index ef70bf2da..b6cf710b9 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -17,7 +17,7 @@ from io import BytesIO from PIL import Image from rag.nlp import tokenize, is_english -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, PptParser, PlainParser from PyPDF2 import PdfReader as pdf2_read @@ -96,9 +96,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, eng = lang.lower() == "english" doc = { "docnm_kwd": filename, - "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) } - doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] if re.search(r"\.pptx?$", filename, re.IGNORECASE): ppt_parser = Ppt() diff --git a/rag/app/qa.py b/rag/app/qa.py index b29752e3d..e07a4f5b0 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -16,7 +16,7 @@ from io import BytesIO from nltk import word_tokenize from openpyxl import load_workbook from rag.nlp import is_english, random_choices, find_codec -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from deepdoc.parser import ExcelParser @@ -73,8 +73,8 @@ def beAdoc(d, q, a, eng): aprefix = "Answer: " if eng else "回答:" d["content_with_weight"] = "\t".join( [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) - d["content_ltks"] = huqie.qie(q) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + d["content_ltks"] = rag_tokenizer.tokenize(q) + d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) return d @@ -94,7 +94,7 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): res = [] doc = { "docnm_kwd": filename, - "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) } if re.search(r"\.xlsx?$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") diff --git a/rag/app/resume.py b/rag/app/resume.py index d341bcba0..95c19fa7d 100644 --- a/rag/app/resume.py +++ b/rag/app/resume.py @@ -18,7 +18,7 @@ import re import pandas as pd import requests from api.db.services.knowledgebase_service import KnowledgebaseService -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from deepdoc.parser.resume import refactor from deepdoc.parser.resume import step_one, step_two from rag.settings import cron_logger @@ -131,9 +131,9 @@ def chunk(filename, binary=None, callback=None, **kwargs): titles.append(str(v)) doc = { "docnm_kwd": filename, - "title_tks": huqie.qie("-".join(titles) + "-简历") + "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历") } - doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"]) + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) pairs = [] for n, m in 
field_map.items(): if not resume.get(n): @@ -147,8 +147,8 @@ def chunk(filename, binary=None, callback=None, **kwargs): doc["content_with_weight"] = "\n".join( ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs]) - doc["content_ltks"] = huqie.qie(doc["content_with_weight"]) - doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"]) + doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"]) + doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"]) for n, _ in field_map.items(): if n not in resume: continue @@ -156,7 +156,7 @@ def chunk(filename, binary=None, callback=None, **kwargs): len(resume[n]) == 1 or n not in forbidden_select_fields4resume): resume[n] = resume[n][0] if n.find("_tks") > 0: - resume[n] = huqie.qieqie(resume[n]) + resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n]) doc[n] = resume[n] print(doc) diff --git a/rag/app/table.py b/rag/app/table.py index f62edcce1..039a4787f 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -20,7 +20,7 @@ from openpyxl import load_workbook from dateutil.parser import parse as datetime_parse from api.db.services.knowledgebase_service import KnowledgebaseService -from rag.nlp import huqie, is_english, tokenize, find_codec +from rag.nlp import rag_tokenizer, is_english, tokenize, find_codec from deepdoc.parser import ExcelParser @@ -216,7 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, for ii, row in df.iterrows(): d = { "docnm_kwd": filename, - "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename)) + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) } row_txt = [] for j in range(len(clmns)): @@ -227,7 +227,7 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000, if pd.isna(row[clmns[j]]): continue fld = clmns_map[j][0] - d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie( + d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else rag_tokenizer.tokenize( row[clmns[j]]) row_txt.append("{}:{}".format(clmns[j], row[clmns[j]])) if not row_txt: diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 0d0988f81..61ba8401b 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -2,7 +2,7 @@ import random from collections import Counter from rag.utils import num_tokens_from_string -from . import huqie +from . 
import rag_tokenizer import re import copy @@ -109,8 +109,8 @@ def is_english(texts): def tokenize(d, t, eng): d["content_with_weight"] = t t = re.sub(r"]{0,12})?>", " ", t) - d["content_ltks"] = huqie.qie(t) - d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"]) + d["content_ltks"] = rag_tokenizer.tokenize(t) + d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) def tokenize_chunks(chunks, doc, eng, pdf_parser): diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 9d9c85573..ce7476257 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -7,7 +7,7 @@ import logging import copy from elasticsearch_dsl import Q -from rag.nlp import huqie, term_weight, synonym +from rag.nlp import rag_tokenizer, term_weight, synonym class EsQueryer: @@ -47,13 +47,13 @@ class EsQueryer: txt = re.sub( r"[ \r\n\t,,。??/`!!&]+", " ", - huqie.tradi2simp( - huqie.strQ2B( + rag_tokenizer.tradi2simp( + rag_tokenizer.strQ2B( txt.lower()))).strip() txt = EsQueryer.rmWWW(txt) if not self.isChinese(txt): - tks = huqie.qie(txt).split(" ") + tks = rag_tokenizer.tokenize(txt).split(" ") q = copy.deepcopy(tks) for i in range(1, len(tks)): q.append("\"%s %s\"^2" % (tks[i - 1], tks[i])) @@ -65,7 +65,7 @@ class EsQueryer: boost=1)#, minimum_should_match=min_match) ), tks - def needQieqie(tk): + def need_fine_grained_tokenize(tk): if len(tk) < 4: return False if re.match(r"[0-9a-z\.\+#_\*-]+$", tk): @@ -81,7 +81,7 @@ class EsQueryer: logging.info(json.dumps(twts, ensure_ascii=False)) tms = [] for tk, w in sorted(twts, key=lambda x: x[1] * -1): - sm = huqie.qieqie(tk).split(" ") if needQieqie(tk) else [] + sm = rag_tokenizer.fine_grained_tokenize(tk).split(" ") if need_fine_grained_tokenize(tk) else [] sm = [ re.sub( r"[ ,\./;'\[\]\\`~!@#$%\^&\*\(\)=\+_<>\?:\"\{\}\|,。;‘’【】、!¥……()——《》?:“”-]+", @@ -110,10 +110,10 @@ class EsQueryer: if len(twts) > 1: tms += f" (\"%s\"~4)^1.5" % (" ".join([t for t, _ in twts])) if re.match(r"[0-9a-z ]+$", tt): - tms = f"(\"{tt}\" OR \"%s\")" % huqie.qie(tt) + tms = f"(\"{tt}\" OR \"%s\")" % rag_tokenizer.tokenize(tt) syns = " OR ".join( - ["\"%s\"^0.7" % EsQueryer.subSpecialChar(huqie.qie(s)) for s in syns]) + ["\"%s\"^0.7" % EsQueryer.subSpecialChar(rag_tokenizer.tokenize(s)) for s in syns]) if syns: tms = f"({tms})^5 OR ({syns})^0.7" diff --git a/rag/nlp/rag_tokenizer.py b/rag/nlp/rag_tokenizer.py new file mode 100644 index 000000000..abd9146a9 --- /dev/null +++ b/rag/nlp/rag_tokenizer.py @@ -0,0 +1,423 @@ +# -*- coding: utf-8 -*- + +import copy +import datrie +import math +import os +import re +import string +import sys +from hanziconv import HanziConv +from huggingface_hub import snapshot_download +from nltk import word_tokenize +from nltk.stem import PorterStemmer, WordNetLemmatizer +from api.utils.file_utils import get_project_base_directory + + +class RagTokenizer: + def key_(self, line): + return str(line.lower().encode("utf-8"))[2:-1] + + def rkey_(self, line): + return str(("DD" + (line[::-1].lower())).encode("utf-8"))[2:-1] + + def loadDict_(self, fnm): + print("[HUQIE]:Build trie", fnm, file=sys.stderr) + try: + of = open(fnm, "r") + while True: + line = of.readline() + if not line: + break + line = re.sub(r"[\r\n]+", "", line) + line = re.split(r"[ \t]", line) + k = self.key_(line[0]) + F = int(math.log(float(line[1]) / self.DENOMINATOR) + .5) + if k not in self.trie_ or self.trie_[k][0] < F: + self.trie_[self.key_(line[0])] = (F, line[2]) + self.trie_[self.rkey_(line[0])] = 1 + self.trie_.save(fnm + ".trie") + of.close() + except Exception as e: + 
print("[HUQIE]:Faild to build trie, ", fnm, e, file=sys.stderr) + + def __init__(self, debug=False): + self.DEBUG = debug + self.DENOMINATOR = 1000000 + self.trie_ = datrie.Trie(string.printable) + self.DIR_ = os.path.join(get_project_base_directory(), "rag/res", "huqie") + + self.stemmer = PorterStemmer() + self.lemmatizer = WordNetLemmatizer() + + self.SPLIT_CHAR = r"([ ,\.<>/?;'\[\]\\`!@#$%^&*\(\)\{\}\|_+=《》,。?、;‘’:“”【】~!¥%……()——-]+|[a-z\.-]+|[0-9,\.-]+)" + try: + self.trie_ = datrie.Trie.load(self.DIR_ + ".txt.trie") + return + except Exception as e: + print("[HUQIE]:Build default trie", file=sys.stderr) + self.trie_ = datrie.Trie(string.printable) + + self.loadDict_(self.DIR_ + ".txt") + + def loadUserDict(self, fnm): + try: + self.trie_ = datrie.Trie.load(fnm + ".trie") + return + except Exception as e: + self.trie_ = datrie.Trie(string.printable) + self.loadDict_(fnm) + + def addUserDict(self, fnm): + self.loadDict_(fnm) + + def _strQ2B(self, ustring): + """把字符串全角转半角""" + rstring = "" + for uchar in ustring: + inside_code = ord(uchar) + if inside_code == 0x3000: + inside_code = 0x0020 + else: + inside_code -= 0xfee0 + if inside_code < 0x0020 or inside_code > 0x7e: # 转完之后不是半角字符返回原来的字符 + rstring += uchar + else: + rstring += chr(inside_code) + return rstring + + def _tradi2simp(self, line): + return HanziConv.toSimplified(line) + + def dfs_(self, chars, s, preTks, tkslist): + MAX_L = 10 + res = s + # if s > MAX_L or s>= len(chars): + if s >= len(chars): + tkslist.append(preTks) + return res + + # pruning + S = s + 1 + if s + 2 <= len(chars): + t1, t2 = "".join(chars[s:s + 1]), "".join(chars[s:s + 2]) + if self.trie_.has_keys_with_prefix(self.key_(t1)) and not self.trie_.has_keys_with_prefix( + self.key_(t2)): + S = s + 2 + if len(preTks) > 2 and len( + preTks[-1][0]) == 1 and len(preTks[-2][0]) == 1 and len(preTks[-3][0]) == 1: + t1 = preTks[-1][0] + "".join(chars[s:s + 1]) + if self.trie_.has_keys_with_prefix(self.key_(t1)): + S = s + 2 + + ################ + for e in range(S, len(chars) + 1): + t = "".join(chars[s:e]) + k = self.key_(t) + + if e > s + 1 and not self.trie_.has_keys_with_prefix(k): + break + + if k in self.trie_: + pretks = copy.deepcopy(preTks) + if k in self.trie_: + pretks.append((t, self.trie_[k])) + else: + pretks.append((t, (-12, ''))) + res = max(res, self.dfs_(chars, e, pretks, tkslist)) + + if res > s: + return res + + t = "".join(chars[s:s + 1]) + k = self.key_(t) + if k in self.trie_: + preTks.append((t, self.trie_[k])) + else: + preTks.append((t, (-12, ''))) + + return self.dfs_(chars, s + 1, preTks, tkslist) + + def freq(self, tk): + k = self.key_(tk) + if k not in self.trie_: + return 0 + return int(math.exp(self.trie_[k][0]) * self.DENOMINATOR + 0.5) + + def tag(self, tk): + k = self.key_(tk) + if k not in self.trie_: + return "" + return self.trie_[k][1] + + def score_(self, tfts): + B = 30 + F, L, tks = 0, 0, [] + for tk, (freq, tag) in tfts: + F += freq + L += 0 if len(tk) < 2 else 1 + tks.append(tk) + F /= len(tks) + L /= len(tks) + if self.DEBUG: + print("[SC]", tks, len(tks), L, F, B / len(tks) + L + F) + return tks, B / len(tks) + L + F + + def sortTks_(self, tkslist): + res = [] + for tfts in tkslist: + tks, s = self.score_(tfts) + res.append((tks, s)) + return sorted(res, key=lambda x: x[1], reverse=True) + + def merge_(self, tks): + patts = [ + (r"[ ]+", " "), + (r"([0-9\+\.,%\*=-]) ([0-9\+\.,%\*=-])", r"\1\2"), + ] + # for p,s in patts: tks = re.sub(p, s, tks) + + # if split chars is part of token + res = [] + tks = re.sub(r"[ ]+", " ", 
tks).split(" ") + s = 0 + while True: + if s >= len(tks): + break + E = s + 1 + for e in range(s + 2, min(len(tks) + 2, s + 6)): + tk = "".join(tks[s:e]) + if re.search(self.SPLIT_CHAR, tk) and self.freq(tk): + E = e + res.append("".join(tks[s:E])) + s = E + + return " ".join(res) + + def maxForward_(self, line): + res = [] + s = 0 + while s < len(line): + e = s + 1 + t = line[s:e] + while e < len(line) and self.trie_.has_keys_with_prefix( + self.key_(t)): + e += 1 + t = line[s:e] + + while e - 1 > s and self.key_(t) not in self.trie_: + e -= 1 + t = line[s:e] + + if self.key_(t) in self.trie_: + res.append((t, self.trie_[self.key_(t)])) + else: + res.append((t, (0, ''))) + + s = e + + return self.score_(res) + + def maxBackward_(self, line): + res = [] + s = len(line) - 1 + while s >= 0: + e = s + 1 + t = line[s:e] + while s > 0 and self.trie_.has_keys_with_prefix(self.rkey_(t)): + s -= 1 + t = line[s:e] + + while s + 1 < e and self.key_(t) not in self.trie_: + s += 1 + t = line[s:e] + + if self.key_(t) in self.trie_: + res.append((t, self.trie_[self.key_(t)])) + else: + res.append((t, (0, ''))) + + s -= 1 + + return self.score_(res[::-1]) + + def tokenize(self, line): + line = self._strQ2B(line).lower() + line = self._tradi2simp(line) + zh_num = len([1 for c in line if is_chinese(c)]) + if zh_num < len(line) * 0.2: + return " ".join([self.stemmer.stem(self.lemmatizer.lemmatize(t)) for t in word_tokenize(line)]) + + arr = re.split(self.SPLIT_CHAR, line) + res = [] + for L in arr: + if len(L) < 2 or re.match( + r"[a-z\.-]+$", L) or re.match(r"[0-9\.-]+$", L): + res.append(L) + continue + # print(L) + + # use maxforward for the first time + tks, s = self.maxForward_(L) + tks1, s1 = self.maxBackward_(L) + if self.DEBUG: + print("[FW]", tks, s) + print("[BW]", tks1, s1) + + diff = [0 for _ in range(max(len(tks1), len(tks)))] + for i in range(min(len(tks1), len(tks))): + if tks[i] != tks1[i]: + diff[i] = 1 + + if s1 > s: + tks = tks1 + + i = 0 + while i < len(tks): + s = i + while s < len(tks) and diff[s] == 0: + s += 1 + if s == len(tks): + res.append(" ".join(tks[i:])) + break + if s > i: + res.append(" ".join(tks[i:s])) + + e = s + while e < len(tks) and e - s < 5 and diff[e] == 1: + e += 1 + + tkslist = [] + self.dfs_("".join(tks[s:e + 1]), 0, [], tkslist) + res.append(" ".join(self.sortTks_(tkslist)[0][0])) + + i = e + 1 + + res = " ".join(res) + if self.DEBUG: + print("[TKS]", self.merge_(res)) + return self.merge_(res) + + def fine_grained_tokenize(self, tks): + tks = tks.split(" ") + zh_num = len([1 for c in tks if c and is_chinese(c[0])]) + if zh_num < len(tks) * 0.2: + res = [] + for tk in tks: + res.extend(tk.split("/")) + return " ".join(res) + + res = [] + for tk in tks: + if len(tk) < 3 or re.match(r"[0-9,\.-]+$", tk): + res.append(tk) + continue + tkslist = [] + if len(tk) > 10: + tkslist.append(tk) + else: + self.dfs_(tk, 0, [], tkslist) + if len(tkslist) < 2: + res.append(tk) + continue + stk = self.sortTks_(tkslist)[1][0] + if len(stk) == len(tk): + stk = tk + else: + if re.match(r"[a-z\.-]+$", tk): + for t in stk: + if len(t) < 3: + stk = tk + break + else: + stk = " ".join(stk) + else: + stk = " ".join(stk) + + res.append(stk) + + return " ".join(res) + + +def is_chinese(s): + if s >= u'\u4e00' and s <= u'\u9fa5': + return True + else: + return False + + +def is_number(s): + if s >= u'\u0030' and s <= u'\u0039': + return True + else: + return False + + +def is_alphabet(s): + if (s >= u'\u0041' and s <= u'\u005a') or ( + s >= u'\u0061' and s <= u'\u007a'): + return True + 
else: + return False + + +def naiveQie(txt): + tks = [] + for t in txt.split(" "): + if tks and re.match(r".*[a-zA-Z]$", tks[-1] + ) and re.match(r".*[a-zA-Z]$", t): + tks.append(" ") + tks.append(t) + return tks + + +tokenizer = RagTokenizer() +tokenize = tokenizer.tokenize +fine_grained_tokenize = tokenizer.fine_grained_tokenize +tag = tokenizer.tag +freq = tokenizer.freq +loadUserDict = tokenizer.loadUserDict +addUserDict = tokenizer.addUserDict +tradi2simp = tokenizer._tradi2simp +strQ2B = tokenizer._strQ2B + +if __name__ == '__main__': + tknzr = RagTokenizer(debug=True) + # huqie.addUserDict("/tmp/tmp.new.tks.dict") + tks = tknzr.tokenize( + "哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈哈") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize( + "公开征求意见稿提出,境外投资者可使用自有人民币或外汇投资。使用外汇投资的,可通过债券持有人在香港人民币业务清算行及香港地区经批准可进入境内银行间外汇市场进行交易的境外人民币业务参加行(以下统称香港结算行)办理外汇资金兑换。香港结算行由此所产生的头寸可到境内银行间外汇市场平盘。使用外汇投资的,在其投资的债券到期或卖出后,原则上应兑换回外汇。") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize( + "多校划片就是一个小区对应多个小学初中,让买了学区房的家庭也不确定到底能上哪个学校。目的是通过这种方式为学区房降温,把就近入学落到实处。南京市长江大桥") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize( + "实际上当时他们已经将业务中心偏移到安全部门和针对政府企业的部门 Scripts are compiled and cached aaaaaaaaa") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize("虽然我不怎么玩") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize("蓝月亮如何在外资夹击中生存,那是全宇宙最有意思的") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize( + "涡轮增压发动机num最大功率,不像别的共享买车锁电子化的手段,我们接过来是否有意义,黄黄爱美食,不过,今天阿奇要讲到的这家农贸市场,说实话,还真蛮有特色的!不仅环境好,还打出了") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize("这周日你去吗?这周日你有空吗?") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize("Unity3D开发经验 测试开发工程师 c++双11双11 985 211 ") + print(tknzr.fine_grained_tokenize(tks)) + tks = tknzr.tokenize( + "数据分析项目经理|数据分析挖掘|数据分析方向|商品数据分析|搜索数据分析 sql python hive tableau Cocos2d-") + print(tknzr.fine_grained_tokenize(tks)) + if len(sys.argv) < 2: + sys.exit() + tknzr.DEBUG = False + tknzr.loadUserDict(sys.argv[1]) + of = open(sys.argv[2], "r") + while True: + line = of.readline() + if not line: + break + print(tknzr.tokenize(line)) + of.close() diff --git a/rag/nlp/search.py b/rag/nlp/search.py index 5400cbecc..fcf2254f5 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -9,7 +9,7 @@ from dataclasses import dataclass from rag.settings import es_logger from rag.utils import rmSpace -from rag.nlp import huqie, query +from rag.nlp import rag_tokenizer, query import numpy as np @@ -128,7 +128,7 @@ class Dealer: kwds = set([]) for k in keywords: kwds.add(k) - for kk in huqie.qieqie(k).split(" "): + for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "): if len(kk) < 2: continue if kk in kwds: @@ -243,7 +243,7 @@ class Dealer: assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. 
{}".format( len(ans_v[0]), len(chunk_v[0])) - chunks_tks = [huqie.qie(self.qryr.rmWWW(ck)).split(" ") + chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ") for ck in chunks] cites = {} thr = 0.63 @@ -251,7 +251,7 @@ class Dealer: for i, a in enumerate(pieces_): sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i], chunk_v, - huqie.qie( + rag_tokenizer.tokenize( self.qryr.rmWWW(pieces_[i])).split(" "), chunks_tks, tkweight, vtweight) @@ -310,8 +310,8 @@ class Dealer: def hybrid_similarity(self, ans_embd, ins_embd, ans, inst): return self.qryr.hybrid_similarity(ans_embd, ins_embd, - huqie.qie(ans).split(" "), - huqie.qie(inst).split(" ")) + rag_tokenizer.tokenize(ans).split(" "), + rag_tokenizer.tokenize(inst).split(" ")) def retrieval(self, question, embd_mdl, tenant_id, kb_ids, page, page_size, similarity_threshold=0.2, vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True): @@ -385,7 +385,7 @@ class Dealer: for r in re.finditer(r" ([a-z_]+_l?tks)( like | ?= ?)'([^']+)'", sql): fld, v = r.group(1), r.group(3) match = " MATCH({}, '{}', 'operator=OR;minimum_should_match=30%') ".format( - fld, huqie.qieqie(huqie.qie(v))) + fld, rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(v))) replaces.append( ("{}{}'{}'".format( r.group(1), diff --git a/rag/nlp/term_weight.py b/rag/nlp/term_weight.py index f1b360446..178dafe10 100644 --- a/rag/nlp/term_weight.py +++ b/rag/nlp/term_weight.py @@ -4,7 +4,7 @@ import json import re import os import numpy as np -from rag.nlp import huqie +from rag.nlp import rag_tokenizer from api.utils.file_utils import get_project_base_directory @@ -83,7 +83,7 @@ class Dealer: txt = re.sub(p, r, txt) res = [] - for t in huqie.qie(txt).split(" "): + for t in rag_tokenizer.tokenize(txt).split(" "): tk = t if (stpwd and tk in self.stop_words) or ( re.match(r"[0-9]$", tk) and not num): @@ -161,7 +161,7 @@ class Dealer: return m[self.ne[t]] def postag(t): - t = huqie.tag(t) + t = rag_tokenizer.tag(t) if t in set(["r", "c", "d"]): return 0.3 if t in set(["ns", "nt"]): @@ -175,14 +175,14 @@ class Dealer: def freq(t): if re.match(r"[0-9. -]{2,}$", t): return 3 - s = huqie.freq(t) + s = rag_tokenizer.freq(t) if not s and re.match(r"[a-z. -]+$", t): return 300 if not s: s = 0 if not s and len(t) >= 4: - s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1] + s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1] if len(s) > 1: s = np.min([freq(tt) for tt in s]) / 6. else: @@ -198,7 +198,7 @@ class Dealer: elif re.match(r"[a-z. -]+$", t): return 300 elif len(t) >= 4: - s = [tt for tt in huqie.qieqie(t).split(" ") if len(tt) > 1] + s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1] if len(s) > 1: return max(3, np.min([df(tt) for tt in s]) / 6.) 
diff --git a/rag/svr/cache_file_svr.py b/rag/svr/cache_file_svr.py index f63c043de..e9654818e 100644 --- a/rag/svr/cache_file_svr.py +++ b/rag/svr/cache_file_svr.py @@ -4,13 +4,14 @@ import traceback from api.db.db_models import close_connection from api.db.services.task_service import TaskService +from rag.settings import cron_logger from rag.utils.minio_conn import MINIO from rag.utils.redis_conn import REDIS_CONN def collect(): doc_locations = TaskService.get_ongoing_doc_name() - #print(tasks) + print(doc_locations) if len(doc_locations) == 0: time.sleep(1) return @@ -28,7 +29,7 @@ def main(): if REDIS_CONN.exist(key):continue file_bin = MINIO.get(kb_id, loc) REDIS_CONN.transaction(key, file_bin, 12 * 60) - print("CACHE:", loc) + cron_logger.info("CACHE: {}".format(loc)) except Exception as e: traceback.print_stack(e) except Exception as e: diff --git a/rag/svr/task_broker.py b/rag/svr/task_broker.py index c479ca75c..97fb9b75c 100644 --- a/rag/svr/task_broker.py +++ b/rag/svr/task_broker.py @@ -21,7 +21,6 @@ from datetime import datetime from api.db.db_models import Task from api.db.db_utils import bulk_insert_into_db from api.db.services.file2document_service import File2DocumentService -from api.db.services.file_service import FileService from api.db.services.task_service import TaskService from deepdoc.parser import PdfParser from deepdoc.parser.excel_parser import RAGFlowExcelParser
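
Note on the renamed tokenizer API: the sketch below is a minimal usage example, not part of the patch. It relies only on the module-level aliases defined at the bottom of the new `rag/nlp/rag_tokenizer.py` (`tokenize`, `fine_grained_tokenize`, `tag`, `freq`), which replace the old `huqie.qie`/`huqie.qieqie`/`huqie.tag` calls rewritten throughout this diff. The sample string and variable names are illustrative assumptions.

```python
# Minimal sketch of the renamed API surface introduced by this PR.
# Old name            -> New name
# huqie.qie(text)     -> rag_tokenizer.tokenize(text)
# huqie.qieqie(tks)   -> rag_tokenizer.fine_grained_tokenize(tks)
# huqie.tag / freq    -> rag_tokenizer.tag / rag_tokenizer.freq
from rag.nlp import rag_tokenizer

text = "Unity3D开发经验 测试开发工程师 c++"  # illustrative input

# Coarse pass: full-width -> half-width, traditional -> simplified Chinese,
# then trie-based forward/backward segmentation (stem/lemmatize for mostly-English input).
coarse = rag_tokenizer.tokenize(text)

# Fine-grained pass re-splits longer coarse tokens, as used for the *_sm_ltks fields.
fine = rag_tokenizer.fine_grained_tokenize(coarse)

print(coarse)
print(fine)

# POS tag and corpus-frequency lookups keep their old semantics under the new module name.
print(rag_tokenizer.tag("中国"), rag_tokenizer.freq("中国"))
```

Both functions return space-joined token strings, which is why the call sites in this diff (e.g. `content_ltks` / `content_sm_ltks` in `chunk_app.py` and the `rag/app/*` chunkers) feed the output of `tokenize` directly into `fine_grained_tokenize`.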