feat: support json file (#1217)
### What problem does this PR solve?

feat: support json file.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: KevinHuSh <kevinhu.sh@gmail.com>
This commit is contained in:
parent
f7cdb2678c
commit
18f4a6b35c
@@ -16,3 +16,4 @@ from .docx_parser import RAGFlowDocxParser as DocxParser
 from .excel_parser import RAGFlowExcelParser as ExcelParser
 from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
+from .json_parser import RAGFlowJsonParser as JsonParser
deepdoc/parser/json_parser.py (new file, 116 lines)
@@ -0,0 +1,116 @@
# -*- coding: utf-8 -*-
# The following documents are mainly referenced, and only adaptation modifications have been made
# from https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/json.py

import json
from typing import Any, Dict, List, Optional

from rag.nlp import find_codec


class RAGFlowJsonParser:
    def __init__(
        self, max_chunk_size: int = 2000, min_chunk_size: Optional[int] = None
    ):
        super().__init__()
        self.max_chunk_size = max_chunk_size * 2
        self.min_chunk_size = (
            min_chunk_size
            if min_chunk_size is not None
            else max(max_chunk_size - 200, 50)
        )

    def __call__(self, binary):
        encoding = find_codec(binary)
        txt = binary.decode(encoding, errors="ignore")
        json_data = json.loads(txt)
        chunks = self.split_json(json_data, True)
        sections = [json.dumps(l, ensure_ascii=False) for l in chunks if l]
        return sections

    @staticmethod
    def _json_size(data: Dict) -> int:
        """Calculate the size of the serialized JSON object."""
        return len(json.dumps(data, ensure_ascii=False))

    @staticmethod
    def _set_nested_dict(d: Dict, path: List[str], value: Any) -> None:
        """Set a value in a nested dictionary based on the given path."""
        for key in path[:-1]:
            d = d.setdefault(key, {})
        d[path[-1]] = value

    def _list_to_dict_preprocessing(self, data: Any) -> Any:
        if isinstance(data, dict):
            # Process each key-value pair in the dictionary
            return {k: self._list_to_dict_preprocessing(v) for k, v in data.items()}
        elif isinstance(data, list):
            # Convert the list to a dictionary with index-based keys
            return {
                str(i): self._list_to_dict_preprocessing(item)
                for i, item in enumerate(data)
            }
        else:
            # Base case: the item is neither a dict nor a list, so return it unchanged
            return data

    def _json_split(
        self,
        data: Dict[str, Any],
        current_path: Optional[List[str]] = None,
        chunks: Optional[List[Dict]] = None,
    ) -> List[Dict]:
        """
        Split json into maximum size dictionaries while preserving structure.
        """
        current_path = current_path or []
        chunks = chunks or [{}]
        if isinstance(data, dict):
            for key, value in data.items():
                new_path = current_path + [key]
                chunk_size = self._json_size(chunks[-1])
                size = self._json_size({key: value})
                remaining = self.max_chunk_size - chunk_size

                if size < remaining:
                    # Add item to current chunk
                    self._set_nested_dict(chunks[-1], new_path, value)
                else:
                    if chunk_size >= self.min_chunk_size:
                        # Chunk is big enough, start a new chunk
                        chunks.append({})

                    # Iterate
                    self._json_split(value, new_path, chunks)
        else:
            # handle single item
            self._set_nested_dict(chunks[-1], current_path, data)
        return chunks

    def split_json(
        self,
        json_data: Dict[str, Any],
        convert_lists: bool = False,
    ) -> List[Dict]:
        """Splits JSON into a list of JSON chunks"""

        if convert_lists:
            chunks = self._json_split(self._list_to_dict_preprocessing(json_data))
        else:
            chunks = self._json_split(json_data)

        # Remove the last chunk if it's empty
        if not chunks[-1]:
            chunks.pop()
        return chunks

    def split_text(
        self,
        json_data: Dict[str, Any],
        convert_lists: bool = False,
        ensure_ascii: bool = True,
    ) -> List[str]:
        """Splits JSON into a list of JSON formatted strings"""

        chunks = self.split_json(json_data=json_data, convert_lists=convert_lists)

        # Convert to string
        return [json.dumps(chunk, ensure_ascii=ensure_ascii) for chunk in chunks]
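For orientation, a minimal usage sketch of the new parser (not part of the commit). It assumes the ragflow repository is on the import path; the sample document and the max_chunk_size value are illustrative.

# Feed JSON bytes to the parser and get back a list of JSON-formatted
# string sections, one per size-bounded sub-dictionary.
import json

from deepdoc.parser import JsonParser  # alias added in the hunk above

sample = {
    "title": "RAGFlow",
    "features": ["pdf", "docx", "json"],  # lists become index-keyed dicts (convert_lists=True in __call__)
    "meta": {"version": 1, "license": "Apache-2.0"},
}

parser = JsonParser(max_chunk_size=128)
sections = parser(json.dumps(sample).encode("utf-8"))
for s in sections:
    print(s)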
@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 import re
 from deepdoc.parser.pdf_parser import PlainParser
 from rag.nlp import rag_tokenizer, naive_merge, tokenize_table, tokenize_chunks, find_codec
-from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser, HtmlParser, JsonParser
 from rag.settings import cron_logger
 from rag.utils import num_tokens_from_string
@@ -167,6 +167,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         sections = [(l, "") for l in sections if l]
         callback(0.8, "Finish parsing.")

+    elif re.search(r"\.json$", filename, re.IGNORECASE):
+        callback(0.1, "Start to parse.")
+        sections = JsonParser(parser_config.get("chunk_token_num", 128))(binary)
+        sections = [(l, "") for l in sections if l]
+        callback(0.8, "Finish parsing.")
+
     elif re.search(r"\.doc$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         binary = BytesIO(binary)
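A minimal sketch of what this branch does in isolation (not part of the commit); parser_config and the payload below are illustrative stand-ins for the surrounding chunk() arguments, and JsonParser is the alias imported above.

# chunk_token_num (default 128) is handed to the parser as max_chunk_size,
# which the parser doubles and applies to the character length of the
# serialized JSON rather than to a token count.
parser_config = {"chunk_token_num": 128}     # illustrative config
binary = b'{"a": {"b": 1}, "c": [2, 3]}'     # illustrative payload

sections = JsonParser(parser_config.get("chunk_token_num", 128))(binary)
sections = [(l, "") for l in sections if l]  # (text, position) pairs, matching the other branches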
@@ -471,7 +471,9 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
        tnum = num_tokens_from_string(t)
        if tnum < 8:
            pos = ""
        # Ensure that the length of the merged chunk does not exceed chunk_token_num
        if tk_nums[-1] > chunk_token_num:

            if t.find(pos) < 0:
                t += pos
            cks.append(t)
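To make the intent of the added guard concrete, here is a small self-contained sketch of the capping idea (not part of the commit); merge_pieces and token_count are hypothetical names, not the actual naive_merge implementation.

from typing import Callable, List


def merge_pieces(pieces: List[str], token_count: Callable[[str], int],
                 chunk_token_num: int = 128) -> List[str]:
    # Once the running token count of the current chunk exceeds chunk_token_num,
    # the next piece opens a new chunk instead of being merged into the current one.
    cks, tk_nums = [""], [0]
    for t in pieces:
        tnum = token_count(t)
        if tk_nums[-1] > chunk_token_num:
            cks.append(t)
            tk_nums.append(tnum)
        else:
            cks[-1] += t
            tk_nums[-1] += tnum
    return [c for c in cks if c]


# e.g. merge_pieces(["alpha ", "beta ", "gamma "], lambda s: len(s.split()), chunk_token_num=2)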