mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-05-29 17:45:33 +08:00

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
31 lines
1.3 KiB
Python
import re
|
|
|
|
from graphrag.index import build_knowlege_graph_chunks
|
|
from rag.app import naive
|
|
from rag.nlp import rag_tokenizer, tokenize_chunks
|
|
|
|
|
|
def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """Build knowledge-graph chunks from a document, plus its text chunks.

    The document is first split into plain sections with the naive parser,
    then a knowledge graph is extracted from those sections; finally the raw
    sections themselves are tokenized and appended, so the returned list
    contains both graph chunks and ordinary text chunks.

    Args:
        filename: Source document name; stamped onto every graph chunk as
            ``docnm_kwd`` and tokenized (extension stripped) into the title
            fields of the text chunks.
        binary: Raw file content, forwarded to ``naive.chunk``.
        tenant_id: Tenant identifier forwarded to the graph builder.
        from_page: First page to parse (forwarded to ``naive.chunk``).
        to_page: Last page to parse (forwarded to ``naive.chunk``).
        lang: ``"English"`` (case-insensitive) selects English tokenization
            for the text chunks; anything else is treated as non-English.
        callback: Optional progress callback forwarded to the sub-steps.
        **kwargs: May carry ``parser_config`` (dict); recognized keys seen
            here include ``chunk_token_num``, ``delimiter``,
            ``layout_recognize`` and ``entity_types``.

    Returns:
        list[dict]: Knowledge-graph chunks followed by the tokenized text
        chunks of the original sections.
    """
    # Shallow-copy so forcing layout_recognize off below never mutates the
    # caller's dict (the original wrote into kwargs["parser_config"] in place).
    parser_config = dict(kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": False}))
    eng = lang.lower() == "english"

    # Layout recognition is not needed for graph extraction; force it off.
    parser_config["layout_recognize"] = False
    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page,
                           section_only=True, parser_config=parser_config,
                           callback=callback)
    # NOTE: "knowlege" is the project's actual exported name — do not rename.
    chunks = build_knowlege_graph_chunks(
        tenant_id, sections, callback,
        parser_config.get("entity_types",
                          ["organization", "person", "location", "event", "time"]))
    for c in chunks:
        c["docnm_kwd"] = filename

    doc = {
        "docnm_kwd": filename,
        # Strip a trailing file extension (e.g. ".pdf") before tokenizing.
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
        "knowledge_graph_kwd": "text"
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    # Keep the plain-text sections as ordinary retrievable chunks as well.
    chunks.extend(tokenize_chunks(sections, doc, eng))

    return chunks