mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-19 05:05:53 +08:00
feat: add jina tokenizer tool (#7375)
This commit is contained in:
parent
bfd905602f
commit
a0c689c273
43
api/core/tools/provider/builtin/jina/tools/jina_tokenizer.py
Normal file
43
api/core/tools/provider/builtin/jina/tools/jina_tokenizer.py
Normal file
@ -0,0 +1,43 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from core.helper import ssrf_proxy
|
||||||
|
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||||
|
from core.tools.tool.builtin_tool import BuiltinTool
|
||||||
|
|
||||||
|
|
||||||
|
class JinaTokenizerTool(BuiltinTool):
    """Tool that calls the Jina tokenizer API to tokenize text or segment long text into chunks."""

    # Public Jina segmentation/tokenization endpoint.
    _jina_tokenizer_endpoint = 'https://tokenize.jina.ai/'

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> ToolInvokeMessage:
        """Invoke the Jina tokenizer endpoint and return its JSON response.

        :param user_id: id of the invoking user (required by the tool interface; unused here)
        :param tool_parameters: expects 'content' (required); optional boolean flags
            'return_chunks' and 'return_tokens', and an optional 'tokenizer' name.
        :return: a JSON tool message wrapping the API response body.
        :raises KeyError: if 'content' is missing from tool_parameters.
        """
        content = tool_parameters['content']
        body = {
            "content": content
        }

        headers = {
            'Content-Type': 'application/json'
        }

        # dict.get already yields None for a missing key, so the previous
        # `'api_key' in ... and ...get('api_key')` membership test was redundant
        # and looked the credential up twice; the walrus does a single lookup.
        if api_key := self.runtime.credentials.get('api_key'):
            headers['Authorization'] = "Bearer " + api_key

        if tool_parameters.get('return_chunks', False):
            body['return_chunks'] = True

        if tool_parameters.get('return_tokens', False):
            body['return_tokens'] = True

        if tokenizer := tool_parameters.get('tokenizer'):
            body['tokenizer'] = tokenizer

        # ssrf_proxy wraps the HTTP client with SSRF protections for outbound calls.
        response = ssrf_proxy.post(
            self._jina_tokenizer_endpoint,
            headers=headers,
            json=body,
        )

        return self.create_json_message(response.json())
|
@ -0,0 +1,64 @@
|
|||||||
|
# Tool manifest for the Jina tokenizer builtin tool.
identity:
  name: jina_tokenizer
  author: hjlarry
  label:
    en_US: JinaTokenizer
description:
  human:
    en_US: Free API to tokenize text and segment long text into chunks.
    zh_Hans: 免费的API可以将文本tokenize,也可以将长文本分割成多个部分。
  llm: Free API to tokenize text and segment long text into chunks.
parameters:
  # The text to process; filled by the LLM at invocation time (form: llm).
  - name: content
    type: string
    required: true
    label:
      en_US: Content
      zh_Hans: 内容
    llm_description: the content which need to tokenize or segment
    form: llm
  # Optional flag: include tokens and their ids in the response.
  - name: return_tokens
    type: boolean
    required: false
    label:
      en_US: Return the tokens
      zh_Hans: 是否返回tokens
    human_description:
      en_US: Return the tokens and their corresponding ids in the response.
      zh_Hans: 返回tokens及其对应的ids。
    form: form
  # Optional flag: segment the input into semantically meaningful chunks.
  - name: return_chunks
    type: boolean
    label:
      en_US: Return the chunks
      zh_Hans: 是否分块
    human_description:
      en_US: Chunking the input into semantically meaningful segments while handling a wide variety of text types and edge cases based on common structural cues.
      zh_Hans: 将输入分块为具有语义意义的片段,同时根据常见的结构线索处理各种文本类型和边缘情况。
    form: form
  # Optional tokenizer vocabulary selection (tiktoken encoding names).
  - name: tokenizer
    type: select
    options:
      - value: cl100k_base
        label:
          en_US: cl100k_base
      - value: o200k_base
        label:
          en_US: o200k_base
      - value: p50k_base
        label:
          en_US: p50k_base
      - value: r50k_base
        label:
          en_US: r50k_base
      - value: p50k_edit
        label:
          en_US: p50k_edit
      - value: gpt2
        label:
          en_US: gpt2
    label:
      en_US: Tokenizer
    human_description:
      en_US: cl100k_base - gpt-4,gpt-3.5-turbo,gpt-3.5; o200k_base - gpt-4o,gpt-4o-mini; p50k_base - text-davinci-003,text-davinci-002
    form: form
|
Loading…
x
Reference in New Issue
Block a user