Produce better chunks before GraphRAG (#1811)

### What problem does this PR solve?

#1594

### Type of change

- [x] Refactoring
2025-08-13 02:19:10 +08:00 · 2024-08-05 16:21:52 +08:00 · 2024-08-05 16:21:52 +08:00 · fe797bcc66
commit fe797bcc66
parent 9542f4484c
1 changed files with 2 additions and 3 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -273,14 +273,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        raise NotImplementedError(
            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
    if kwargs.get("section_only", False):
        return [t for t, _ in sections]
    st = timer()
    chunks = naive_merge(
        sections, int(parser_config.get(
            "chunk_token_num", 128)), parser_config.get(
            "delimiter", "\n!?。；！？"))
    if kwargs.get("section_only", False):
        return chunks
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
    cron_logger.info("naive_merge({}): {}".format(filename, timer() - st))