From 5d15aca85f67e14487d78e4f69a4ab6b532545dc Mon Sep 17 00:00:00 2001 From: Bowen Liang Date: Tue, 4 Jun 2024 02:54:09 +0800 Subject: [PATCH] chore: remove unused code and class in text splitter (#4864) --- api/core/splitter/text_splitter.py | 369 ----------------------------- 1 file changed, 369 deletions(-) diff --git a/api/core/splitter/text_splitter.py b/api/core/splitter/text_splitter.py index 09f6ceb905..b3adcedc76 100644 --- a/api/core/splitter/text_splitter.py +++ b/api/core/splitter/text_splitter.py @@ -6,7 +6,6 @@ import re from abc import ABC, abstractmethod from collections.abc import Callable, Collection, Iterable, Sequence, Set from dataclasses import dataclass -from enum import Enum from typing import ( Any, Literal, @@ -477,27 +476,6 @@ class TokenTextSplitter(TextSplitter): return split_text_on_tokens(text=text, tokenizer=tokenizer) -class Language(str, Enum): - """Enum of the programming languages.""" - - CPP = "cpp" - GO = "go" - JAVA = "java" - JS = "js" - PHP = "php" - PROTO = "proto" - PYTHON = "python" - RST = "rst" - RUBY = "ruby" - RUST = "rust" - SCALA = "scala" - SWIFT = "swift" - MARKDOWN = "markdown" - LATEX = "latex" - HTML = "html" - SOL = "sol" - - class RecursiveCharacterTextSplitter(TextSplitter): """Splitting text by recursively look at characters. @@ -554,350 +532,3 @@ class RecursiveCharacterTextSplitter(TextSplitter): def split_text(self, text: str) -> list[str]: return self._split_text(text, self._separators) - - @classmethod - def from_language( - cls, language: Language, **kwargs: Any - ) -> RecursiveCharacterTextSplitter: - separators = cls.get_separators_for_language(language) - return cls(separators=separators, **kwargs) - - @staticmethod - def get_separators_for_language(language: Language) -> list[str]: - if language == Language.CPP: - return [ - # Split along class definitions - "\nclass ", - # Split along function definitions - "\nvoid ", - "\nint ", - "\nfloat ", - "\ndouble ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.GO: - return [ - # Split along function definitions - "\nfunc ", - "\nvar ", - "\nconst ", - "\ntype ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.JAVA: - return [ - # Split along class definitions - "\nclass ", - # Split along method definitions - "\npublic ", - "\nprotected ", - "\nprivate ", - "\nstatic ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.JS: - return [ - # Split along function definitions - "\nfunction ", - "\nconst ", - "\nlet ", - "\nvar ", - "\nclass ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\nswitch ", - "\ncase ", - "\ndefault ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.PHP: - return [ - # Split along function definitions - "\nfunction ", - # Split along class definitions - "\nclass ", - # Split along control flow statements - "\nif ", - "\nforeach ", - "\nwhile ", - "\ndo ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.PROTO: - return [ - # Split along message definitions - "\nmessage ", - # Split along service definitions - "\nservice ", - # Split along enum definitions - "\nenum ", - # Split along option definitions - "\noption ", - # Split along import statements - "\nimport ", - # Split along syntax declarations - "\nsyntax ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.PYTHON: - return [ - # First, try to split along class definitions - "\nclass ", - "\ndef ", - "\n\tdef ", - # Now split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.RST: - return [ - # Split along section titles - "\n=+\n", - "\n-+\n", - "\n\\*+\n", - # Split along directive markers - "\n\n.. *\n\n", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.RUBY: - return [ - # Split along method definitions - "\ndef ", - "\nclass ", - # Split along control flow statements - "\nif ", - "\nunless ", - "\nwhile ", - "\nfor ", - "\ndo ", - "\nbegin ", - "\nrescue ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.RUST: - return [ - # Split along function definitions - "\nfn ", - "\nconst ", - "\nlet ", - # Split along control flow statements - "\nif ", - "\nwhile ", - "\nfor ", - "\nloop ", - "\nmatch ", - "\nconst ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.SCALA: - return [ - # Split along class definitions - "\nclass ", - "\nobject ", - # Split along method definitions - "\ndef ", - "\nval ", - "\nvar ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\nmatch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.SWIFT: - return [ - # Split along function definitions - "\nfunc ", - # Split along class definitions - "\nclass ", - "\nstruct ", - "\nenum ", - # Split along control flow statements - "\nif ", - "\nfor ", - "\nwhile ", - "\ndo ", - "\nswitch ", - "\ncase ", - # Split by the normal type of lines - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.MARKDOWN: - return [ - # First, try to split along Markdown headings (starting with level 2) - "\n#{1,6} ", - # Note the alternative syntax for headings (below) is not handled here - # Heading level 2 - # --------------- - # End of code block - "```\n", - # Horizontal lines - "\n\\*\\*\\*+\n", - "\n---+\n", - "\n___+\n", - # Note that this splitter doesn't handle horizontal lines defined - # by *three or more* of ***, ---, or ___, but this is not handled - "\n\n", - "\n", - " ", - "", - ] - elif language == Language.LATEX: - return [ - # First, try to split along Latex sections - "\n\\\\chapter{", - "\n\\\\section{", - "\n\\\\subsection{", - "\n\\\\subsubsection{", - # Now split by environments - "\n\\\begin{enumerate}", - "\n\\\begin{itemize}", - "\n\\\begin{description}", - "\n\\\begin{list}", - "\n\\\begin{quote}", - "\n\\\begin{quotation}", - "\n\\\begin{verse}", - "\n\\\begin{verbatim}", - # Now split by math environments - "\n\\\begin{align}", - "$$", - "$", - # Now split by the normal type of lines - " ", - "", - ] - elif language == Language.HTML: - return [ - # First, try to split along HTML tags - "