mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-13 21:15:54 +08:00
Enlarge the term weight difference (#3435)
### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
6d451dbe06
commit
ca9e97d2f2
@ -13,6 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License
|
||||
#
|
||||
import os.path
|
||||
import pathlib
|
||||
import re
|
||||
|
||||
@ -36,7 +37,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
|
||||
from api.settings import RetCode, docStoreConn
|
||||
from api.utils.api_utils import get_json_result
|
||||
from rag.utils.storage_factory import STORAGE_IMPL
|
||||
from api.utils.file_utils import filename_type, thumbnail
|
||||
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
|
||||
from api.utils.web_utils import html2pdf, is_valid_url
|
||||
from api.constants import IMG_BASE64_PREFIX
|
||||
|
||||
@ -529,15 +530,25 @@ def parse():
|
||||
if not is_valid_url(url):
|
||||
return get_json_result(
|
||||
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
|
||||
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
|
||||
os.makedirs(download_path, exist_ok=True)
|
||||
from selenium.webdriver import Chrome, ChromeOptions
|
||||
options = ChromeOptions()
|
||||
options.add_argument('--headless')
|
||||
options.add_argument('--disable-gpu')
|
||||
options.add_argument('--no-sandbox')
|
||||
options.add_argument('--disable-dev-shm-usage')
|
||||
options.add_experimental_option('prefs', {
|
||||
'download.default_directory': download_path,
|
||||
'download.prompt_for_download': False,
|
||||
'download.directory_upgrade': True,
|
||||
'safebrowsing.enabled': True
|
||||
})
|
||||
driver = Chrome(options=options)
|
||||
driver.get(url)
|
||||
print(driver.get_downloadable_files())
|
||||
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
||||
driver.close()
|
||||
return get_json_result(data="\n".join(sections))
|
||||
|
||||
if 'file' not in request.files:
|
||||
|
@ -66,7 +66,7 @@ class FulltextQueryer:
|
||||
|
||||
def question(self, txt, tbl="qa", min_match:float=0.6):
|
||||
txt = re.sub(
|
||||
r"[ :\r\n\t,,。??/`!!&\^%%()^]+",
|
||||
r"[ :\r\n\t,,。??/`!!&\^%%()^\[\]]+",
|
||||
" ",
|
||||
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
|
||||
).strip()
|
||||
|
@ -228,6 +228,7 @@ class Dealer:
|
||||
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
||||
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
||||
np.array([ner(t) * postag(t) for t in tks])
|
||||
wts = [math.exp(s) for s in wts]
|
||||
tw = list(zip(tks, wts))
|
||||
else:
|
||||
for tk in tks:
|
||||
@ -236,6 +237,7 @@ class Dealer:
|
||||
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
||||
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
||||
np.array([ner(t) * postag(t) for t in tt])
|
||||
wts = [math.exp(s) for s in wts]
|
||||
tw.extend(zip(tt, wts))
|
||||
|
||||
S = np.sum([s for _, s in tw])
|
||||
|
Loading…
x
Reference in New Issue
Block a user