Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-08-14 14:55:54 +08:00)
feat: add jina tokenizer tool (#7375)
parent bfd905602f · commit a0c689c273
api/core/tools/provider/builtin/jina/tools/jina_tokenizer.py (new file, 43 lines)
@@ -0,0 +1,43 @@
from typing import Any

from core.helper import ssrf_proxy
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool


class JinaTokenizerTool(BuiltinTool):
    _jina_tokenizer_endpoint = 'https://tokenize.jina.ai/'

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> ToolInvokeMessage:
        content = tool_parameters['content']
        body = {
            "content": content
        }

        headers = {
            'Content-Type': 'application/json'
        }

        if 'api_key' in self.runtime.credentials and self.runtime.credentials.get('api_key'):
            headers['Authorization'] = "Bearer " + self.runtime.credentials.get('api_key')

        if tool_parameters.get('return_chunks', False):
            body['return_chunks'] = True

        if tool_parameters.get('return_tokens', False):
            body['return_tokens'] = True

        if tokenizer := tool_parameters.get('tokenizer'):
            body['tokenizer'] = tokenizer

        response = ssrf_proxy.post(
            self._jina_tokenizer_endpoint,
            headers=headers,
            json=body,
        )

        return self.create_json_message(response.json())
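For reference, the tool above is a thin wrapper over Jina's public tokenizer endpoint: it forwards the content plus the optional return_tokens, return_chunks and tokenizer fields, and adds a Bearer header only when an api_key credential is configured. Below is a minimal standalone sketch of the same request using the plain requests library instead of Dify's ssrf_proxy helper; the sample text and the api_key placeholder are illustrative, not part of the commit.

import requests

# Same request body the tool assembles from its parameters.
body = {
    "content": "Dify is an open-source LLM app development platform.",
    "return_tokens": True,
    "return_chunks": True,
    "tokenizer": "cl100k_base",
}
headers = {"Content-Type": "application/json"}

api_key = None  # optional; per the tool code, the request is also sent without a key
if api_key:
    headers["Authorization"] = "Bearer " + api_key

response = requests.post("https://tokenize.jina.ai/", headers=headers, json=body)
print(response.json())  # JSON payload with token and chunk information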
The tool's YAML manifest (new file, 64 lines)
@@ -0,0 +1,64 @@
identity:
  name: jina_tokenizer
  author: hjlarry
  label:
    en_US: JinaTokenizer
description:
  human:
    en_US: Free API to tokenize text and segment long text into chunks.
    zh_Hans: 免费的API可以将文本tokenize,也可以将长文本分割成多个部分。
  llm: Free API to tokenize text and segment long text into chunks.
parameters:
  - name: content
    type: string
    required: true
    label:
      en_US: Content
      zh_Hans: 内容
    llm_description: the content which need to tokenize or segment
    form: llm
  - name: return_tokens
    type: boolean
    required: false
    label:
      en_US: Return the tokens
      zh_Hans: 是否返回tokens
    human_description:
      en_US: Return the tokens and their corresponding ids in the response.
      zh_Hans: 返回tokens及其对应的ids。
    form: form
  - name: return_chunks
    type: boolean
    label:
      en_US: Return the chunks
      zh_Hans: 是否分块
    human_description:
      en_US: Chunking the input into semantically meaningful segments while handling a wide variety of text types and edge cases based on common structural cues.
      zh_Hans: 将输入分块为具有语义意义的片段,同时根据常见的结构线索处理各种文本类型和边缘情况。
    form: form
  - name: tokenizer
    type: select
    options:
      - value: cl100k_base
        label:
          en_US: cl100k_base
      - value: o200k_base
        label:
          en_US: o200k_base
      - value: p50k_base
        label:
          en_US: p50k_base
      - value: r50k_base
        label:
          en_US: r50k_base
      - value: p50k_edit
        label:
          en_US: p50k_edit
      - value: gpt2
        label:
          en_US: gpt2
    label:
      en_US: Tokenizer
    human_description:
      en_US: cl100k_base - gpt-4,gpt-3.5-turbo,gpt-3.5; o200k_base - gpt-4o,gpt-4o-mini; p50k_base - text-davinci-003,text-davinci-002
    form: form
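The manifest's form fields arrive in _invoke as the tool_parameters dict, with the select options above supplying the tokenizer string. An illustrative sketch of that mapping, reusing the same key checks as the tool code; the example values are made up.

# Illustrative parameter dict as the Dify runtime might pass it to _invoke,
# and the request body the tool code derives from it.
tool_parameters = {
    "content": "Long document text ...",
    "return_tokens": False,
    "return_chunks": True,
    "tokenizer": "o200k_base",  # one of the select options defined above
}

body = {"content": tool_parameters["content"]}
if tool_parameters.get("return_chunks", False):
    body["return_chunks"] = True
if tool_parameters.get("return_tokens", False):
    body["return_tokens"] = True
if tokenizer := tool_parameters.get("tokenizer"):
    body["tokenizer"] = tokenizer

assert body == {
    "content": "Long document text ...",
    "return_chunks": True,
    "tokenizer": "o200k_base",
}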