mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 16:59:04 +08:00
chore: remove unused code and class in text splitter (#4864)
This commit is contained in:
parent
b98a1a3303
commit
5d15aca85f
@ -6,7 +6,6 @@ import re
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from collections.abc import Callable, Collection, Iterable, Sequence, Set
|
from collections.abc import Callable, Collection, Iterable, Sequence, Set
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import Enum
|
|
||||||
from typing import (
|
from typing import (
|
||||||
Any,
|
Any,
|
||||||
Literal,
|
Literal,
|
||||||
@ -477,27 +476,6 @@ class TokenTextSplitter(TextSplitter):
|
|||||||
return split_text_on_tokens(text=text, tokenizer=tokenizer)
|
return split_text_on_tokens(text=text, tokenizer=tokenizer)
|
||||||
|
|
||||||
|
|
||||||
class Language(str, Enum):
    """Languages and markup formats that have a dedicated separator list.

    Each member is also a ``str`` (the enum mixes in ``str``), so a member
    compares equal to its lowercase value, e.g. ``Language.CPP == "cpp"``.
    Iteration order follows the declaration order below.
    """

    # Programming languages, schema languages and reStructuredText.
    CPP = "cpp"
    GO = "go"
    JAVA = "java"
    JS = "js"
    PHP = "php"
    PROTO = "proto"
    PYTHON = "python"
    RST = "rst"
    RUBY = "ruby"
    RUST = "rust"
    SCALA = "scala"
    SWIFT = "swift"

    # Document / markup formats.
    MARKDOWN = "markdown"
    LATEX = "latex"
    HTML = "html"

    # Solidity.
    SOL = "sol"
|
|
||||||
|
|
||||||
|
|
||||||
class RecursiveCharacterTextSplitter(TextSplitter):
|
class RecursiveCharacterTextSplitter(TextSplitter):
|
||||||
"""Splitting text by recursively look at characters.
|
"""Splitting text by recursively look at characters.
|
||||||
|
|
||||||
@ -554,350 +532,3 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|||||||
|
|
||||||
def split_text(self, text: str) -> list[str]:
    """Split ``text`` into chunks using this splitter's separators.

    This is a thin entry point: it hands ``text`` and the instance's
    ``_separators`` to ``_split_text`` and returns whatever that produces.
    """
    split_impl = self._split_text
    return split_impl(text, self._separators)
|
||||||
|
|
||||||
@classmethod
def from_language(
    cls, language: Language, **kwargs: Any
) -> RecursiveCharacterTextSplitter:
    """Build a splitter preconfigured with the separators for ``language``.

    Args:
        language: The language whose separator list should be used; it is
            passed to ``get_separators_for_language``.
        **kwargs: Forwarded unchanged to the class constructor.

    Returns:
        A new instance of ``cls`` created with ``separators=...``.
    """
    return cls(separators=cls.get_separators_for_language(language), **kwargs)
|
|
||||||
|
|
||||||
@staticmethod
def get_separators_for_language(language: Language) -> list[str]:
    """Return the ordered separator list used to split text in ``language``.

    The list goes from coarse, language-specific boundaries (class and
    function definitions, control-flow keywords, section markers) down to
    generic fallbacks. Several entries use regular-expression syntax
    (e.g. ``"\\n=+\\n"``), so they are presumably applied as patterns --
    confirm against ``_split_text``.

    Args:
        language: A ``Language`` member, or a string equal to a member's value.

    Returns:
        A freshly built list of separator strings, most specific first.

    Raises:
        ValueError: If ``language`` does not match any supported language.
    """
    # Generic fallbacks shared by most languages: blank line, newline,
    # space, and finally the empty string.
    generic_tail = ["\n\n", "\n", " ", ""]

    separators_by_language: dict[str, list[str]] = {
        "cpp": [
            # Class definitions
            "\nclass ",
            # Function definitions
            "\nvoid ",
            "\nint ",
            "\nfloat ",
            "\ndouble ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "go": [
            # Function and declaration keywords
            "\nfunc ",
            "\nvar ",
            "\nconst ",
            "\ntype ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "java": [
            # Class definitions
            "\nclass ",
            # Method definitions
            "\npublic ",
            "\nprotected ",
            "\nprivate ",
            "\nstatic ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "js": [
            # Function, variable and class definitions
            "\nfunction ",
            "\nconst ",
            "\nlet ",
            "\nvar ",
            "\nclass ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nswitch ",
            "\ncase ",
            "\ndefault ",
            *generic_tail,
        ],
        "php": [
            # Function definitions
            "\nfunction ",
            # Class definitions
            "\nclass ",
            # Control flow statements
            "\nif ",
            "\nforeach ",
            "\nwhile ",
            "\ndo ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "proto": [
            # Message, service and enum definitions
            "\nmessage ",
            "\nservice ",
            "\nenum ",
            # Option definitions, import statements, syntax declarations
            "\noption ",
            "\nimport ",
            "\nsyntax ",
            *generic_tail,
        ],
        "python": [
            # Class definitions first, then top-level and tab-indented defs
            "\nclass ",
            "\ndef ",
            "\n\tdef ",
            *generic_tail,
        ],
        "rst": [
            # Section title underlines
            "\n=+\n",
            "\n-+\n",
            "\n\\*+\n",
            # Directive markers
            "\n\n.. *\n\n",
            *generic_tail,
        ],
        "ruby": [
            # Method and class definitions
            "\ndef ",
            "\nclass ",
            # Control flow statements
            "\nif ",
            "\nunless ",
            "\nwhile ",
            "\nfor ",
            "\ndo ",
            "\nbegin ",
            "\nrescue ",
            *generic_tail,
        ],
        "rust": [
            # Function and binding definitions ("\nconst " listed once)
            "\nfn ",
            "\nconst ",
            "\nlet ",
            # Control flow statements
            "\nif ",
            "\nwhile ",
            "\nfor ",
            "\nloop ",
            "\nmatch ",
            *generic_tail,
        ],
        "scala": [
            # Class and object definitions
            "\nclass ",
            "\nobject ",
            # Method and value definitions
            "\ndef ",
            "\nval ",
            "\nvar ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\nmatch ",
            "\ncase ",
            *generic_tail,
        ],
        "swift": [
            # Function definitions
            "\nfunc ",
            # Type definitions
            "\nclass ",
            "\nstruct ",
            "\nenum ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\ndo ",
            "\nswitch ",
            "\ncase ",
            *generic_tail,
        ],
        "markdown": [
            # ATX headings, levels 1 through 6. Setext-style headings
            # (a title underlined with === or ---) are not matched by
            # this heading pattern.
            "\n#{1,6} ",
            # End of a fenced code block
            "```\n",
            # Horizontal rules: three or more of *, - or _ on their own line
            "\n\\*\\*\\*+\n",
            "\n---+\n",
            "\n___+\n",
            *generic_tail,
        ],
        "latex": [
            # Sectioning commands
            "\n\\\\chapter{",
            "\n\\\\section{",
            "\n\\\\subsection{",
            "\n\\\\subsubsection{",
            # Environments. The backslash must be doubled twice here:
            # "\\\b" would be a backslash followed by a BACKSPACE
            # character, which never occurs in "\begin{...}".
            "\n\\\\begin{enumerate}",
            "\n\\\\begin{itemize}",
            "\n\\\\begin{description}",
            "\n\\\\begin{list}",
            "\n\\\\begin{quote}",
            "\n\\\\begin{quotation}",
            "\n\\\\begin{verse}",
            "\n\\\\begin{verbatim}",
            # Math environments
            "\n\\\\begin{align}",
            # NOTE(review): if separators are applied as regular
            # expressions, "$" is an end-of-string anchor rather than a
            # literal dollar sign -- confirm against _split_text.
            "$$",
            "$",
            # Generic fallbacks (no newline-based entries for LaTeX)
            " ",
            "",
        ],
        "html": [
            # Body-level tags
            "<body",
            "<div",
            "<p",
            "<br",
            "<li",
            "<h1",
            "<h2",
            "<h3",
            "<h4",
            "<h5",
            "<h6",
            "<span",
            "<table",
            "<tr",
            "<td",
            "<th",
            "<ul",
            "<ol",
            "<header",
            "<footer",
            "<nav",
            # Head-level tags
            "<head",
            "<style",
            "<script",
            "<meta",
            "<title",
            "",
        ],
        "sol": [
            # Compiler information definitions
            "\npragma ",
            "\nusing ",
            # Contract definitions
            "\ncontract ",
            "\ninterface ",
            "\nlibrary ",
            # Method and member definitions
            "\nconstructor ",
            "\ntype ",
            "\nfunction ",
            "\nevent ",
            "\nmodifier ",
            "\nerror ",
            "\nstruct ",
            "\nenum ",
            # Control flow statements
            "\nif ",
            "\nfor ",
            "\nwhile ",
            "\ndo while ",
            "\nassembly ",
            *generic_tail,
        ],
    }

    # Match with ``==`` instead of a dict lookup so that both ``Language``
    # members and their plain string values are accepted; Enum members do
    # not hash like their string value, and unhashable inputs must still
    # end in ValueError rather than TypeError.
    for value, separators in separators_by_language.items():
        if language == value:
            return separators

    raise ValueError(
        f"Language {language} is not supported! "
        f"Please choose from {list(Language)}"
    )
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user