mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 20:39:01 +08:00
chore: remove unused code and class in text splitter (#4864)
This commit is contained in:
parent
b98a1a3303
commit
5d15aca85f
@ -6,7 +6,6 @@ import re
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Callable, Collection, Iterable, Sequence, Set
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import (
|
||||
Any,
|
||||
Literal,
|
||||
@ -477,27 +476,6 @@ class TokenTextSplitter(TextSplitter):
|
||||
return split_text_on_tokens(text=text, tokenizer=tokenizer)
|
||||
|
||||
|
||||
class Language(str, Enum):
    """Identifiers of the languages and markup formats known to the splitter.

    Each member mixes in ``str``, so it compares equal to (and hashes like)
    its lowercase string value, e.g. ``Language.CPP == "cpp"``.
    """

    # Programming languages and interface-definition formats
    CPP = "cpp"
    GO = "go"
    JAVA = "java"
    JS = "js"
    PHP = "php"
    PROTO = "proto"
    PYTHON = "python"
    RST = "rst"
    RUBY = "ruby"
    RUST = "rust"
    SCALA = "scala"
    SWIFT = "swift"

    # Markup / document formats
    MARKDOWN = "markdown"
    LATEX = "latex"
    HTML = "html"

    # Solidity
    SOL = "sol"
|
||||
|
||||
|
||||
class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
"""Splitting text by recursively look at characters.
|
||||
|
||||
@ -554,350 +532,3 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
||||
|
||||
def split_text(self, text: str) -> list[str]:
    """Split ``text`` using the separators configured on this instance.

    Args:
        text: The text to split.

    Returns:
        Whatever ``self._split_text`` produces for ``text`` and
        ``self._separators``.
    """
    configured_separators = self._separators
    return self._split_text(text, configured_separators)
|
||||
|
||||
@classmethod
def from_language(
    cls, language: Language, **kwargs: Any
) -> RecursiveCharacterTextSplitter:
    """Create a splitter pre-configured with the separators of ``language``.

    Args:
        language: Language whose separator list should be used.
        **kwargs: Extra keyword arguments forwarded unchanged to the
            constructor.

    Returns:
        A new instance of ``cls`` built with ``separators`` set to the list
        returned by ``get_separators_for_language(language)``.
    """
    return cls(
        separators=cls.get_separators_for_language(language),
        **kwargs,
    )
|
||||
|
||||
@staticmethod
def get_separators_for_language(language: Language) -> list[str]:
    """Return the ordered separator patterns for ``language``.

    Each list runs from the coarsest boundary (class, section or tag level)
    down to the finest (a single space where present, then the empty
    string).  Several entries use regex syntax (for example ``=+`` and
    ``#{1,6}``), so the patterns appear to be intended as regular
    expressions rather than literal strings.

    Args:
        language: A ``Language`` member, or the equal plain string value
            (``Language`` mixes in ``str``, so ``"cpp"`` is accepted just
            like ``Language.CPP``).

    Returns:
        A freshly built list of separator patterns; callers may mutate it.

    Raises:
        ValueError: If ``language`` is not one of the supported languages.
    """
    # Generic fallbacks shared by most languages: paragraph, line, word, char.
    generic_tail = ["\n\n", "\n", " ", ""]

    # Keyed by the string values of ``Language``; members hash and compare
    # equal to these strings because of the ``str`` mix-in.  The table is
    # rebuilt on every call so each caller gets its own list objects.
    separators_by_language: dict[str, list[str]] = {
        "cpp": [
            # Class definitions
            "\nclass ",
            # Function definitions
            "\nvoid ",
            "\nint ",
            "\nfloat ",
            "\ndouble ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "go": [
            # Function and top-level declarations
            "\nfunc ",
            "\nvar ",
            "\nconst ",
            "\ntype ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "java": [
            # Class definitions
            "\nclass ",
            # Method definitions
            "\npublic ",
            "\nprotected ",
            "\nprivate ",
            "\nstatic ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "js": [
            # Function, variable and class declarations
            "\nfunction ",
            "\nconst ",
            "\nlet ",
            "\nvar ",
            "\nclass ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            "\ndefault ",
            *generic_tail,
        ],
        "php": [
            # Function definitions
            "\nfunction ",
            # Class definitions
            "\nclass ",
            # Control flow statements
            "\nif ",
            "\nforeach ",
            "\nwhile ",
            "\ndo ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "proto": [
            # Message definitions
            "\nmessage ",
            # Service definitions
            "\nservice ",
            # Enum definitions
            "\nenum ",
            # Option definitions
            "\noption ",
            # Import statements
            "\nimport ",
            # Syntax declarations
            "\nsyntax ",
            *generic_tail,
        ],
        "python": [
            # Class definitions first, then top-level and tab-indented defs
            "\nclass ",
            "\ndef ",
            "\n\tdef ",
            *generic_tail,
        ],
        "rst": [
            # Section title underlines
            "\n=+\n",
            "\n-+\n",
            "\n\\*+\n",
            # Directive markers
            "\n\n.. *\n\n",
            *generic_tail,
        ],
        "ruby": [
            # Method and class definitions
            "\ndef ",
            "\nclass ",
            # Control flow statements
            "\nif ",
            "\nunless ",
            "\nwhile ",
            "\nfor ",
            "\ndo ",
            "\nbegin ",
            "\nrescue ",
            *generic_tail,
        ],
        "rust": [
            # Function definitions and bindings
            "\nfn ",
            "\nconst ",
            "\nlet ",
            # Control flow statements ("\nconst " used to be repeated here;
            # the duplicate added nothing to an ordered list and was dropped)
            "\nif ",
            "\nwhile ",
            "\nfor ",
            "\nloop ",
            "\nmatch ",
            *generic_tail,
        ],
        "scala": [
            # Class and object definitions
            "\nclass ",
            "\nobject ",
            # Method and value definitions
            "\ndef ",
            "\nval ",
            "\nvar ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nmatch ",
            "\ncase ",
            *generic_tail,
        ],
        "swift": [
            # Function definitions
            "\nfunc ",
            # Type definitions
            "\nclass ",
            "\nstruct ",
            "\nenum ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\ndo ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "markdown": [
            # ATX headings of any level (one to six leading '#').  The
            # alternative underline heading syntax is not handled:
            #   Heading level 2
            #   ---------------
            "\n#{1,6} ",
            # End of a fenced code block
            "```\n",
            # Horizontal rules: three or more '*', '-' or '_' on their own line
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
            *generic_tail,
        ],
        "latex": [
            # Sectioning commands
            "\n\\\\chapter{",
            "\n\\\\section{",
            "\n\\\\subsection{",
            "\n\\\\subsubsection{",
            # Environments.  These need four backslashes like the entries
            # above: with three, Python reads "\b" as a backspace character
            # and the pattern can never match "\begin{".
            "\n\\\\begin{enumerate}",
            "\n\\\\begin{itemize}",
            "\n\\\\begin{description}",
            "\n\\\\begin{list}",
            "\n\\\\begin{quote}",
            "\n\\\\begin{quotation}",
            "\n\\\\begin{verse}",
            "\n\\\\begin{verbatim}",
            # Math environments
            "\n\\\\begin{align}",
            # NOTE(review): if these are applied as regexes, an unescaped
            # "$" is an end-of-string anchor, not a literal dollar sign --
            # confirm against the code that consumes this list.
            "$$",
            "$",
            # Finest-grained fallbacks
            " ",
            "",
        ],
        "html": [
            # Body-level tags
            "<body",
            "<div",
            "<p",
            "<br",
            "<li",
            "<h1",
            "<h2",
            "<h3",
            "<h4",
            "<h5",
            "<h6",
            "<span",
            "<table",
            "<tr",
            "<td",
            "<th",
            "<ul",
            "<ol",
            "<header",
            "<footer",
            "<nav",
            # Head-level tags
            "<head",
            "<style",
            "<script",
            "<meta",
            "<title",
            "",
        ],
        "sol": [
            # Compiler directives
            "\npragma ",
            "\nusing ",
            # Contract-level definitions
            "\ncontract ",
            "\ninterface ",
            "\nlibrary ",
            # Member definitions
            "\nconstructor ",
            "\ntype ",
            "\nfunction ",
            "\nevent ",
            "\nmodifier ",
            "\nerror ",
            "\nstruct ",
            "\nenum ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\ndo while ",
            "\nassembly ",
            *generic_tail,
        ],
    }

    try:
        separators = separators_by_language.get(language)
    except TypeError:
        # Unhashable input can never equal a Language member; report it the
        # same way as any other unsupported value.
        separators = None

    if separators is None:
        raise ValueError(
            f"Language {language} is not supported! "
            f"Please choose from {list(Language)}"
        )
    return separators
|
||||
|
Loading…
x
Reference in New Issue
Block a user