From db4371c74598894e2cf72e5d102fe92a731d0e2d Mon Sep 17 00:00:00 2001 From: Stephen Hu Date: Fri, 23 May 2025 14:30:19 +0800 Subject: [PATCH] Fix: Improve First Chunk Size (#7806) ### What problem does this PR solve? https://github.com/infiniflow/ragflow/issues/7790 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/nlp/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 71ca2aca1..0fe0653fc 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -524,7 +524,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"): if tnum < 8: pos = "" # Ensure that the length of the merged chunk does not exceed chunk_token_num - if tk_nums[-1] > chunk_token_num: + if cks[-1] == "" or tk_nums[-1] > chunk_token_num: if t.find(pos) < 0: t += pos @@ -560,7 +560,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 if tnum < 8: pos = "" # Ensure that the length of the merged chunk does not exceed chunk_token_num - if tk_nums[-1] > chunk_token_num: + if cks[-1] == "" or tk_nums[-1] > chunk_token_num: if t.find(pos) < 0: t += pos cks.append(t) @@ -627,7 +627,7 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): tnum = num_tokens_from_string(t) if tnum < 8: pos = "" - if tk_nums[-1] > chunk_token_num: + if cks[-1] == "" or tk_nums[-1] > chunk_token_num: if t.find(pos) < 0: t += pos cks.append(t)