Mirror of https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git (synced 2025-06-04 11:24:00 +08:00)

### What problem does this PR solve?

Add license statement.

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
49 lines
1.9 KiB
Python
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import re

from graphrag.index import build_knowledge_graph_chunks
from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize_chunks

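# Build knowledge-graph chunks for a document: split it into sections with
# the naive parser, extract entities/relations into graph chunks, then
# append the ordinary tokenized text chunks so both are indexed together.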
def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?;。;!?", "layout_recognize": True})
    eng = lang.lower() == "english"

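    # Layout recognition is forced on for this parser regardless of the
    # caller's config; section_only=True makes naive.chunk return the raw
    # sections instead of finished chunks.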
    parser_config["layout_recognize"] = True
    sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True,
                           parser_config=parser_config, callback=callback)
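    # Extract entities and relations from the sections into knowledge-graph
    # chunks, falling back to a generic entity-type list when the parser
    # config does not provide one.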
    chunks = build_knowledge_graph_chunks(tenant_id, sections, callback,
                                          parser_config.get("entity_types", ["organization", "person", "location", "event", "time"]))
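    # Tag every graph chunk with the source document name so it can be
    # traced back to the originating file.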
    for c in chunks:
        c["docnm_kwd"] = filename

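    # Document-level metadata shared by the plain text chunks: the title is
    # tokenized with the file extension stripped, and "knowledge_graph_kwd"
    # marks these entries as plain text rather than graph content.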
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
        "knowledge_graph_kwd": "text"
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    chunks.extend(tokenize_chunks(sections, doc, eng))

    return chunks
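For reference, a minimal sketch of how this parser might be invoked directly. The module path, tenant id, file bytes, and callback signature are all assumptions for illustration; in RAGFlow the function is normally driven by the task executor rather than called by hand.

```python
# Hypothetical driver for the chunk() function above. The import path,
# tenant id, and progress-callback signature are assumptions, not taken
# from this file.
from rag.app.knowledge_graph import chunk  # assumed module path


def progress(prog=None, msg=""):
    # Accepts both positional progress fractions and keyword-only messages.
    print(f"[{prog}] {msg}")


with open("report.pdf", "rb") as f:
    binary = f.read()

chunks = chunk(
    "report.pdf",
    binary,
    tenant_id="tenant-0",  # assumed tenant id
    lang="English",
    callback=progress,
    parser_config={
        "chunk_token_num": 512,
        "delimiter": "\n!?;。;!?",
        "layout_recognize": True,
        "entity_types": ["organization", "person", "location", "event", "time"],
    },
)
print(f"{len(chunks)} chunks ready for indexing")
```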