mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 15:48:59 +08:00
be better chunks before graphrag (#1811)
### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring
This commit is contained in:
parent
9542f4484c
commit
fe797bcc66
@@ -273,14 +273,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
||||||
|
|
||||||
if kwargs.get("section_only", False):
|
|
||||||
return [t for t, _ in sections]
|
|
||||||
|
|
||||||
st = timer()
|
st = timer()
|
||||||
chunks = naive_merge(
|
chunks = naive_merge(
|
||||||
sections, int(parser_config.get(
|
sections, int(parser_config.get(
|
||||||
"chunk_token_num", 128)), parser_config.get(
|
"chunk_token_num", 128)), parser_config.get(
|
||||||
"delimiter", "\n!?。;!?"))
|
"delimiter", "\n!?。;!?"))
|
||||||
|
if kwargs.get("section_only", False):
|
||||||
|
return chunks
|
||||||
|
|
||||||
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
|
||||||
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user