mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 15:59:04 +08:00
be better chunks before graphrag (#1811)
### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring
This commit is contained in:
parent
9542f4484c
commit
fe797bcc66
@@ -273,14 +273,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
||||
|
||||
if kwargs.get("section_only", False):
|
||||
return [t for t, _ in sections]
|
||||
|
||||
st = timer()
|
||||
chunks = naive_merge(
|
||||
sections, int(parser_config.get(
|
||||
"chunk_token_num", 128)), parser_config.get(
|
||||
"delimiter", "\n!?。;!?"))
|
||||
if kwargs.get("section_only", False):
|
||||
return chunks
|
||||
|
||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||
|
Loading…
x
Reference in New Issue
Block a user