diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index 71ca2aca1..0fe0653fc 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -524,7 +524,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"): if tnum < 8: pos = "" # Ensure that the length of the merged chunk does not exceed chunk_token_num - if tk_nums[-1] > chunk_token_num: + if cks[-1] == "" or tk_nums[-1] > chunk_token_num: if t.find(pos) < 0: t += pos @@ -560,7 +560,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 if tnum < 8: pos = "" # Ensure that the length of the merged chunk does not exceed chunk_token_num - if tk_nums[-1] > chunk_token_num: + if cks[-1] == "" or tk_nums[-1] > chunk_token_num: if t.find(pos) < 0: t += pos cks.append(t) @@ -627,7 +627,7 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): tnum = num_tokens_from_string(t) if tnum < 8: pos = "" - if tk_nums[-1] > chunk_token_num: + if cks[-1] == "" or tk_nums[-1] > chunk_token_num: if t.find(pos) < 0: t += pos cks.append(t)