From fe797bcc6672bf6432d5bc16c9e25bd242ec05ed Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 5 Aug 2024 16:21:52 +0800 Subject: [PATCH] Make better chunks before graphrag (#1811) ### What problem does this PR solve? #1594 ### Type of change - [x] Refactoring --- rag/app/naive.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rag/app/naive.py b/rag/app/naive.py index 6c39954c5..ab824bfab 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -273,14 +273,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, raise NotImplementedError( "file type not supported yet(pdf, xlsx, doc, docx, txt supported)") - if kwargs.get("section_only", False): - return [t for t, _ in sections] - st = timer() chunks = naive_merge( sections, int(parser_config.get( "chunk_token_num", 128)), parser_config.get( "delimiter", "\n!?。;!?")) + if kwargs.get("section_only", False): + return chunks res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser)) cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))