mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-14 23:56:01 +08:00
enlarge the default token length of RAPTOR summarization (#3454)
### What problem does this PR solve?

#3426

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
dc05f43eee
commit
a1d01a1b2f
@ -13,6 +13,7 @@
|
|||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License
|
# limitations under the License
|
||||||
#
|
#
|
||||||
|
import json
|
||||||
import os.path
|
import os.path
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
@ -533,7 +534,7 @@ def parse():
|
|||||||
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
|
data=False, message='The URL format is invalid', code=settings.RetCode.ARGUMENT_ERROR)
|
||||||
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
|
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
|
||||||
os.makedirs(download_path, exist_ok=True)
|
os.makedirs(download_path, exist_ok=True)
|
||||||
from selenium.webdriver import Chrome, ChromeOptions
|
from seleniumwire.webdriver import Chrome, ChromeOptions
|
||||||
options = ChromeOptions()
|
options = ChromeOptions()
|
||||||
options.add_argument('--headless')
|
options.add_argument('--headless')
|
||||||
options.add_argument('--disable-gpu')
|
options.add_argument('--disable-gpu')
|
||||||
@ -547,10 +548,31 @@ def parse():
|
|||||||
})
|
})
|
||||||
driver = Chrome(options=options)
|
driver = Chrome(options=options)
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
print(driver.get_downloadable_files())
|
res_headers = [r.response.headers for r in driver.requests]
|
||||||
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
if len(res_headers) > 1:
|
||||||
driver.close()
|
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
|
||||||
return get_json_result(data="\n".join(sections))
|
driver.quit()
|
||||||
|
return get_json_result(data="\n".join(sections))
|
||||||
|
|
||||||
|
class File:
|
||||||
|
filename: str
|
||||||
|
filepath: str
|
||||||
|
|
||||||
|
def __init__(self, filename, filepath):
|
||||||
|
self.filename = filename
|
||||||
|
self.filepath = filepath
|
||||||
|
|
||||||
|
def read(self):
|
||||||
|
with open(self.filepath, "r") as f:
|
||||||
|
return f.read()
|
||||||
|
|
||||||
|
r = re.search(r"filename=\"([^\"])\"", json.dumps(res_headers))
|
||||||
|
if not r or r.group(1):
|
||||||
|
return get_json_result(
|
||||||
|
data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR)
|
||||||
|
f = File(r.group(1), os.path.join(download_path, r.group(1)))
|
||||||
|
txt = FileService.parse_docs([f], current_user.id)
|
||||||
|
return get_json_result(data=txt)
|
||||||
|
|
||||||
if 'file' not in request.files:
|
if 'file' not in request.files:
|
||||||
return get_json_result(
|
return get_json_result(
|
||||||
|
@ -68,7 +68,7 @@ class KGSearch(Dealer):
|
|||||||
|
|
||||||
ent_res = self.dataStore.search(src, list(), condition, [matchText, matchDense, fusionExpr], OrderByExpr(), 0, 32, idxnm, kb_ids)
|
ent_res = self.dataStore.search(src, list(), condition, [matchText, matchDense, fusionExpr], OrderByExpr(), 0, 32, idxnm, kb_ids)
|
||||||
ent_res_fields = self.dataStore.getFields(ent_res, src)
|
ent_res_fields = self.dataStore.getFields(ent_res, src)
|
||||||
entities = [d["name_kwd"] for d in ent_res_fields.values()]
|
entities = [d.get["name_kwd"] for d in ent_res_fields.values() if d.get("name_kwd")]
|
||||||
ent_ids = self.dataStore.getChunkIds(ent_res)
|
ent_ids = self.dataStore.getChunkIds(ent_res)
|
||||||
ent_content = merge_into_first(ent_res_fields, "-Entities-")
|
ent_content = merge_into_first(ent_res_fields, "-Entities-")
|
||||||
if ent_content:
|
if ent_content:
|
||||||
|
1374
poetry.lock
generated
1374
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -87,6 +87,7 @@ ruamel-base = "1.0.0"
|
|||||||
scholarly = "1.7.11"
|
scholarly = "1.7.11"
|
||||||
scikit-learn = "1.5.0"
|
scikit-learn = "1.5.0"
|
||||||
selenium = "4.22.0"
|
selenium = "4.22.0"
|
||||||
|
selenium-wire = "5.1.0"
|
||||||
setuptools = "^75.2.0"
|
setuptools = "^75.2.0"
|
||||||
shapely = "2.0.5"
|
shapely = "2.0.5"
|
||||||
six = "1.16.0"
|
six = "1.16.0"
|
||||||
|
@ -26,7 +26,7 @@ from rag.utils import truncate
|
|||||||
|
|
||||||
|
|
||||||
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
class RecursiveAbstractiveProcessing4TreeOrganizedRetrieval:
|
||||||
def __init__(self, max_cluster, llm_model, embd_model, prompt, max_token=256, threshold=0.1):
|
def __init__(self, max_cluster, llm_model, embd_model, prompt, max_token=512, threshold=0.1):
|
||||||
self._max_cluster = max_cluster
|
self._max_cluster = max_cluster
|
||||||
self._llm_model = llm_model
|
self._llm_model = llm_model
|
||||||
self._embd_model = embd_model
|
self._embd_model = embd_model
|
||||||
|
Loading…
x
Reference in New Issue
Block a user