mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-07-03 13:15:12 +08:00
Refa: token similarity calculations. (#6614)
### What problem does this PR solve?

#6507

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
fe0396bbb9
commit
0758c04941
@@ -16,9 +16,11 @@
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
|
import math
|
||||||
import re
|
import re
|
||||||
from rag.utils.doc_store_conn import MatchTextExpr
|
from collections import defaultdict
|
||||||
|
|
||||||
|
from rag.utils.doc_store_conn import MatchTextExpr
|
||||||
from rag.nlp import rag_tokenizer, term_weight, synonym
|
from rag.nlp import rag_tokenizer, term_weight, synonym
|
||||||
|
|
||||||
|
|
||||||
@@ -212,12 +214,11 @@ class FulltextQueryer:
|
|||||||
|
|
||||||
def token_similarity(self, atks, btkss):
|
def token_similarity(self, atks, btkss):
|
||||||
def toDict(tks):
|
def toDict(tks):
|
||||||
d = {}
|
|
||||||
if isinstance(tks, str):
|
if isinstance(tks, str):
|
||||||
tks = tks.split()
|
tks = tks.split()
|
||||||
for t, c in self.tw.weights(tks, preprocess=False):
|
d = defaultdict(int)
|
||||||
if t not in d:
|
wts = self.tw.weights(tks, preprocess=False)
|
||||||
d[t] = 0
|
for i, (t, c) in enumerate(wts):
|
||||||
d[t] += c
|
d[t] += c
|
||||||
return d
|
return d
|
||||||
|
|
||||||
@@ -233,11 +234,11 @@ class FulltextQueryer:
|
|||||||
s = 1e-9
|
s = 1e-9
|
||||||
for k, v in qtwt.items():
|
for k, v in qtwt.items():
|
||||||
if k in dtwt:
|
if k in dtwt:
|
||||||
s += v # * dtwt[k]
|
s += v * dtwt[k]
|
||||||
q = 1e-9
|
q = 1e-9
|
||||||
for k, v in qtwt.items():
|
for k, v in qtwt.items():
|
||||||
q += v
|
q += v * v
|
||||||
return s / q
|
return math.sqrt(3. * (s / q / math.log10( len(dtwt.keys()) + 512 )))
|
||||||
|
|
||||||
def paragraph(self, content_tks: str, keywords: list = [], keywords_topn=30):
|
def paragraph(self, content_tks: str, keywords: list = [], keywords_topn=30):
|
||||||
if isinstance(content_tks, str):
|
if isinstance(content_tks, str):
|
||||||
|
@@ -15,6 +15,7 @@
|
|||||||
#
|
#
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from collections import OrderedDict
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from rag.settings import TAG_FLD, PAGERANK_FLD
|
from rag.settings import TAG_FLD, PAGERANK_FLD
|
||||||
@ -297,7 +298,7 @@ class Dealer:
|
|||||||
sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
|
sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
|
||||||
ins_tw = []
|
ins_tw = []
|
||||||
for i in sres.ids:
|
for i in sres.ids:
|
||||||
content_ltks = sres.field[i][cfield].split()
|
content_ltks = list(OrderedDict.fromkeys(sres.field[i][cfield].split()))
|
||||||
title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
|
title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
|
||||||
question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
|
question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
|
||||||
important_kwd = sres.field[i].get("important_kwd", [])
|
important_kwd = sres.field[i].get("important_kwd", [])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user