mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-14 04:36:01 +08:00
Enlarge the term weight difference (#3435)
### What problem does this PR solve?

### Type of change

- [x] Performance Improvement
This commit is contained in:
parent
6d451dbe06
commit
ca9e97d2f2
@ -13,6 +13,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License
|
# limitations under the License
|
||||||
#
|
#
|
||||||
|
import os.path
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@ -36,7 +37,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
|
|||||||
from api.settings import RetCode, docStoreConn
|
from api.settings import RetCode, docStoreConn
|
||||||
from api.utils.api_utils import get_json_result
|
from api.utils.api_utils import get_json_result
|
||||||
from rag.utils.storage_factory import STORAGE_IMPL
|
from rag.utils.storage_factory import STORAGE_IMPL
|
||||||
from api.utils.file_utils import filename_type, thumbnail
|
from api.utils.file_utils import filename_type, thumbnail, get_project_base_directory
|
||||||
from api.utils.web_utils import html2pdf, is_valid_url
|
from api.utils.web_utils import html2pdf, is_valid_url
|
||||||
from api.constants import IMG_BASE64_PREFIX
|
from api.constants import IMG_BASE64_PREFIX
|
||||||
|
|
||||||
@ -529,15 +530,25 @@ def parse():
|
|||||||
if not is_valid_url(url):
|
if not is_valid_url(url):
|
||||||
return get_json_result(
|
return get_json_result(
|
||||||
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
|
data=False, message='The URL format is invalid', code=RetCode.ARGUMENT_ERROR)
|
||||||
|
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
|
||||||
|
os.makedirs(download_path, exist_ok=True)
|
||||||
from selenium.webdriver import Chrome, ChromeOptions
|
from selenium.webdriver import Chrome, ChromeOptions
|
||||||
options = ChromeOptions()
|
options = ChromeOptions()
|
||||||
options.add_argument('--headless')
|
options.add_argument('--headless')
|
||||||
options.add_argument('--disable-gpu')
|
options.add_argument('--disable-gpu')
|
||||||
options.add_argument('--no-sandbox')
|
options.add_argument('--no-sandbox')
|
||||||
options.add_argument('--disable-dev-shm-usage')
|
options.add_argument('--disable-dev-shm-usage')
|
||||||
|
options.add_experimental_option('prefs', {
|
||||||
|
'download.default_directory': download_path,
|
||||||
|
'download.prompt_for_download': False,
|
||||||
|
'download.directory_upgrade': True,
|
||||||
|
'safebrowsing.enabled': True
|
||||||
|
})
|
||||||
driver = Chrome(options=options)
|
driver = Chrome(options=options)
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
|
print(driver.get_downloadable_files())
|
||||||
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
||||||
|
driver.close()
|
||||||
return get_json_result(data="\n".join(sections))
|
return get_json_result(data="\n".join(sections))
|
||||||
|
|
||||||
if 'file' not in request.files:
|
if 'file' not in request.files:
|
||||||
|
@ -66,7 +66,7 @@ class FulltextQueryer:
|
|||||||
|
|
||||||
def question(self, txt, tbl="qa", min_match:float=0.6):
|
def question(self, txt, tbl="qa", min_match:float=0.6):
|
||||||
txt = re.sub(
|
txt = re.sub(
|
||||||
r"[ :\r\n\t,,。??/`!!&\^%%()^]+",
|
r"[ :\r\n\t,,。??/`!!&\^%%()^\[\]]+",
|
||||||
" ",
|
" ",
|
||||||
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
|
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
|
||||||
).strip()
|
).strip()
|
||||||
|
@ -228,6 +228,7 @@ class Dealer:
|
|||||||
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
idf2 = np.array([idf(df(t), 1000000000) for t in tks])
|
||||||
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
||||||
np.array([ner(t) * postag(t) for t in tks])
|
np.array([ner(t) * postag(t) for t in tks])
|
||||||
|
wts = [math.exp(s) for s in wts]
|
||||||
tw = list(zip(tks, wts))
|
tw = list(zip(tks, wts))
|
||||||
else:
|
else:
|
||||||
for tk in tks:
|
for tk in tks:
|
||||||
@ -236,6 +237,7 @@ class Dealer:
|
|||||||
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
idf2 = np.array([idf(df(t), 1000000000) for t in tt])
|
||||||
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
wts = (0.3 * idf1 + 0.7 * idf2) * \
|
||||||
np.array([ner(t) * postag(t) for t in tt])
|
np.array([ner(t) * postag(t) for t in tt])
|
||||||
|
wts = [math.exp(s) for s in wts]
|
||||||
tw.extend(zip(tt, wts))
|
tw.extend(zip(tt, wts))
|
||||||
|
|
||||||
S = np.sum([s for _, s in tw])
|
S = np.sum([s for _, s in tw])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user