mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 01:39:04 +08:00
feature. add feat to modify metadata via dataset api (#13116)
This commit is contained in:
parent
c2664e0283
commit
d73d191f99
@ -18,6 +18,7 @@ from controllers.service_api.app.error import (
|
||||
from controllers.service_api.dataset.error import (
|
||||
ArchivedDocumentImmutableError,
|
||||
DocumentIndexingError,
|
||||
InvalidMetadataError,
|
||||
)
|
||||
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
|
||||
from core.errors.error import ProviderTokenNotInitError
|
||||
@ -50,6 +51,9 @@ class DocumentAddByTextApi(DatasetApiResource):
|
||||
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
|
||||
)
|
||||
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
|
||||
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
|
||||
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
|
||||
|
||||
args = parser.parse_args()
|
||||
dataset_id = str(dataset_id)
|
||||
tenant_id = str(tenant_id)
|
||||
@ -61,6 +65,28 @@ class DocumentAddByTextApi(DatasetApiResource):
|
||||
if not dataset.indexing_technique and not args["indexing_technique"]:
|
||||
raise ValueError("indexing_technique is required.")
|
||||
|
||||
# Validate metadata if provided
|
||||
if args.get("doc_type") or args.get("doc_metadata"):
|
||||
if not args.get("doc_type") or not args.get("doc_metadata"):
|
||||
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
|
||||
|
||||
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
|
||||
raise InvalidMetadataError(
|
||||
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
|
||||
)
|
||||
|
||||
if not isinstance(args["doc_metadata"], dict):
|
||||
raise InvalidMetadataError("doc_metadata must be a dictionary")
|
||||
|
||||
# Validate metadata schema based on doc_type
|
||||
if args["doc_type"] != "others":
|
||||
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
|
||||
for key, value in args["doc_metadata"].items():
|
||||
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
|
||||
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
|
||||
# set to MetaDataConfig
|
||||
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
|
||||
|
||||
text = args.get("text")
|
||||
name = args.get("name")
|
||||
if text is None or name is None:
|
||||
@ -107,6 +133,8 @@ class DocumentUpdateByTextApi(DatasetApiResource):
|
||||
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
|
||||
)
|
||||
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
|
||||
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
|
||||
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
|
||||
args = parser.parse_args()
|
||||
dataset_id = str(dataset_id)
|
||||
tenant_id = str(tenant_id)
|
||||
@ -115,6 +143,29 @@ class DocumentUpdateByTextApi(DatasetApiResource):
|
||||
if not dataset:
|
||||
raise ValueError("Dataset is not exist.")
|
||||
|
||||
# Validate metadata if provided
|
||||
if args.get("doc_type") or args.get("doc_metadata"):
|
||||
if not args.get("doc_type") or not args.get("doc_metadata"):
|
||||
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
|
||||
|
||||
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
|
||||
raise InvalidMetadataError(
|
||||
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
|
||||
)
|
||||
|
||||
if not isinstance(args["doc_metadata"], dict):
|
||||
raise InvalidMetadataError("doc_metadata must be a dictionary")
|
||||
|
||||
# Validate metadata schema based on doc_type
|
||||
if args["doc_type"] != "others":
|
||||
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
|
||||
for key, value in args["doc_metadata"].items():
|
||||
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
|
||||
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
|
||||
|
||||
# set to MetaDataConfig
|
||||
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
|
||||
|
||||
if args["text"]:
|
||||
text = args.get("text")
|
||||
name = args.get("name")
|
||||
@ -161,6 +212,30 @@ class DocumentAddByFileApi(DatasetApiResource):
|
||||
args["doc_form"] = "text_model"
|
||||
if "doc_language" not in args:
|
||||
args["doc_language"] = "English"
|
||||
|
||||
# Validate metadata if provided
|
||||
if args.get("doc_type") or args.get("doc_metadata"):
|
||||
if not args.get("doc_type") or not args.get("doc_metadata"):
|
||||
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
|
||||
|
||||
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
|
||||
raise InvalidMetadataError(
|
||||
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
|
||||
)
|
||||
|
||||
if not isinstance(args["doc_metadata"], dict):
|
||||
raise InvalidMetadataError("doc_metadata must be a dictionary")
|
||||
|
||||
# Validate metadata schema based on doc_type
|
||||
if args["doc_type"] != "others":
|
||||
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
|
||||
for key, value in args["doc_metadata"].items():
|
||||
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
|
||||
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
|
||||
|
||||
# set to MetaDataConfig
|
||||
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
|
||||
|
||||
# get dataset info
|
||||
dataset_id = str(dataset_id)
|
||||
tenant_id = str(tenant_id)
|
||||
@ -228,6 +303,29 @@ class DocumentUpdateByFileApi(DatasetApiResource):
|
||||
if "doc_language" not in args:
|
||||
args["doc_language"] = "English"
|
||||
|
||||
# Validate metadata if provided
|
||||
if args.get("doc_type") or args.get("doc_metadata"):
|
||||
if not args.get("doc_type") or not args.get("doc_metadata"):
|
||||
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
|
||||
|
||||
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
|
||||
raise InvalidMetadataError(
|
||||
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
|
||||
)
|
||||
|
||||
if not isinstance(args["doc_metadata"], dict):
|
||||
raise InvalidMetadataError("doc_metadata must be a dictionary")
|
||||
|
||||
# Validate metadata schema based on doc_type
|
||||
if args["doc_type"] != "others":
|
||||
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
|
||||
for key, value in args["doc_metadata"].items():
|
||||
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
|
||||
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
|
||||
|
||||
# set to MetaDataConfig
|
||||
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
|
||||
|
||||
# get dataset info
|
||||
dataset_id = str(dataset_id)
|
||||
tenant_id = str(tenant_id)
|
||||
|
@ -42,6 +42,7 @@ from models.source import DataSourceOauthBinding
|
||||
from services.entities.knowledge_entities.knowledge_entities import (
|
||||
ChildChunkUpdateArgs,
|
||||
KnowledgeConfig,
|
||||
MetaDataConfig,
|
||||
RerankingModel,
|
||||
RetrievalModel,
|
||||
SegmentUpdateArgs,
|
||||
@ -894,6 +895,9 @@ class DocumentService:
|
||||
document.data_source_info = json.dumps(data_source_info)
|
||||
document.batch = batch
|
||||
document.indexing_status = "waiting"
|
||||
if knowledge_config.metadata:
|
||||
document.doc_type = knowledge_config.metadata.doc_type
|
||||
document.metadata = knowledge_config.metadata.doc_metadata
|
||||
db.session.add(document)
|
||||
documents.append(document)
|
||||
duplicate_document_ids.append(document.id)
|
||||
@ -910,6 +914,7 @@ class DocumentService:
|
||||
account,
|
||||
file_name,
|
||||
batch,
|
||||
knowledge_config.metadata,
|
||||
)
|
||||
db.session.add(document)
|
||||
db.session.flush()
|
||||
@ -965,6 +970,7 @@ class DocumentService:
|
||||
account,
|
||||
page.page_name,
|
||||
batch,
|
||||
knowledge_config.metadata,
|
||||
)
|
||||
db.session.add(document)
|
||||
db.session.flush()
|
||||
@ -1005,6 +1011,7 @@ class DocumentService:
|
||||
account,
|
||||
document_name,
|
||||
batch,
|
||||
knowledge_config.metadata,
|
||||
)
|
||||
db.session.add(document)
|
||||
db.session.flush()
|
||||
@ -1042,6 +1049,7 @@ class DocumentService:
|
||||
account: Account,
|
||||
name: str,
|
||||
batch: str,
|
||||
metadata: Optional[MetaDataConfig] = None,
|
||||
):
|
||||
document = Document(
|
||||
tenant_id=dataset.tenant_id,
|
||||
@ -1057,6 +1065,9 @@ class DocumentService:
|
||||
doc_form=document_form,
|
||||
doc_language=document_language,
|
||||
)
|
||||
if metadata is not None:
|
||||
document.doc_metadata = metadata.doc_metadata
|
||||
document.doc_type = metadata.doc_type
|
||||
return document
|
||||
|
||||
@staticmethod
|
||||
@ -1169,6 +1180,10 @@ class DocumentService:
|
||||
# update document name
|
||||
if document_data.name:
|
||||
document.name = document_data.name
|
||||
# update doc_type and doc_metadata if provided
|
||||
if document_data.metadata is not None:
|
||||
document.doc_metadata = document_data.metadata.doc_type
|
||||
document.doc_type = document_data.metadata.doc_type
|
||||
# update document to be waiting
|
||||
document.indexing_status = "waiting"
|
||||
document.completed_at = None
|
||||
|
@ -93,6 +93,11 @@ class RetrievalModel(BaseModel):
|
||||
score_threshold: Optional[float] = None
|
||||
|
||||
|
||||
class MetaDataConfig(BaseModel):
|
||||
doc_type: str
|
||||
doc_metadata: dict
|
||||
|
||||
|
||||
class KnowledgeConfig(BaseModel):
|
||||
original_document_id: Optional[str] = None
|
||||
duplicate: bool = True
|
||||
@ -105,6 +110,7 @@ class KnowledgeConfig(BaseModel):
|
||||
embedding_model: Optional[str] = None
|
||||
embedding_model_provider: Optional[str] = None
|
||||
name: Optional[str] = None
|
||||
metadata: Optional[MetaDataConfig] = None
|
||||
|
||||
|
||||
class SegmentUpdateArgs(BaseModel):
|
||||
|
@ -47,6 +47,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
<Property name='text' type='string' key='text'>
|
||||
Document content
|
||||
</Property>
|
||||
<Property name='doc_type' type='string' key='doc_type'>
|
||||
Type of document (optional):
|
||||
- <code>book</code> Book
|
||||
- <code>web_page</code> Web page
|
||||
- <code>paper</code> Academic paper/article
|
||||
- <code>social_media_post</code> Social media post
|
||||
- <code>wikipedia_entry</code> Wikipedia entry
|
||||
- <code>personal_document</code> Personal document
|
||||
- <code>business_document</code> Business document
|
||||
- <code>im_chat_log</code> Chat log
|
||||
- <code>synced_from_notion</code> Notion document
|
||||
- <code>synced_from_github</code> GitHub document
|
||||
- <code>others</code> Other document types
|
||||
</Property>
|
||||
<Property name='doc_metadata' type='object' key='doc_metadata'>
|
||||
Document metadata (required if doc_type is provided). Fields vary by doc_type:
|
||||
For <code>book</code>:
|
||||
- <code>title</code> Book title
|
||||
- <code>language</code> Book language
|
||||
- <code>author</code> Book author
|
||||
- <code>publisher</code> Publisher name
|
||||
- <code>publication_date</code> Publication date
|
||||
- <code>isbn</code> ISBN number
|
||||
- <code>category</code> Book category
|
||||
|
||||
For <code>web_page</code>:
|
||||
- <code>title</code> Page title
|
||||
- <code>url</code> Page URL
|
||||
- <code>language</code> Page language
|
||||
- <code>publish_date</code> Publish date
|
||||
- <code>author/publisher</code> Author or publisher
|
||||
- <code>topic/keywords</code> Topic or keywords
|
||||
- <code>description</code> Page description
|
||||
|
||||
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
|
||||
|
||||
For doc_type "others", any valid JSON object is accepted
|
||||
</Property>
|
||||
<Property name='indexing_technique' type='string' key='indexing_technique'>
|
||||
Index mode
|
||||
- <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
|
||||
@ -195,6 +233,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
- <code>hierarchical_model</code> Parent-child mode
|
||||
- <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
|
||||
|
||||
- <code>doc_type</code> Type of document (optional)
|
||||
- <code>book</code> Book
|
||||
Document records a book or publication
|
||||
- <code>web_page</code> Web page
|
||||
Document records web page content
|
||||
- <code>paper</code> Academic paper/article
|
||||
Document records academic paper or research article
|
||||
- <code>social_media_post</code> Social media post
|
||||
Content from social media posts
|
||||
- <code>wikipedia_entry</code> Wikipedia entry
|
||||
Content from Wikipedia entries
|
||||
- <code>personal_document</code> Personal document
|
||||
Documents related to personal content
|
||||
- <code>business_document</code> Business document
|
||||
Documents related to business content
|
||||
- <code>im_chat_log</code> Chat log
|
||||
Records of instant messaging chats
|
||||
- <code>synced_from_notion</code> Notion document
|
||||
Documents synchronized from Notion
|
||||
- <code>synced_from_github</code> GitHub document
|
||||
Documents synchronized from GitHub
|
||||
- <code>others</code> Other document types
|
||||
Other document types not listed above
|
||||
|
||||
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
|
||||
Fields vary by doc_type:
|
||||
|
||||
For <code>book</code>:
|
||||
- <code>title</code> Book title
|
||||
Title of the book
|
||||
- <code>language</code> Book language
|
||||
Language of the book
|
||||
- <code>author</code> Book author
|
||||
Author of the book
|
||||
- <code>publisher</code> Publisher name
|
||||
Name of the publishing house
|
||||
- <code>publication_date</code> Publication date
|
||||
Date when the book was published
|
||||
- <code>isbn</code> ISBN number
|
||||
International Standard Book Number
|
||||
- <code>category</code> Book category
|
||||
Category or genre of the book
|
||||
|
||||
For <code>web_page</code>:
|
||||
- <code>title</code> Page title
|
||||
Title of the web page
|
||||
- <code>url</code> Page URL
|
||||
URL address of the web page
|
||||
- <code>language</code> Page language
|
||||
Language of the web page
|
||||
- <code>publish_date</code> Publish date
|
||||
Date when the web page was published
|
||||
- <code>author/publisher</code> Author or publisher
|
||||
Author or publisher of the web page
|
||||
- <code>topic/keywords</code> Topic or keywords
|
||||
Topics or keywords of the web page
|
||||
- <code>description</code> Page description
|
||||
Description of the web page content
|
||||
|
||||
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
|
||||
For doc_type "others", any valid JSON object is accepted
|
||||
|
||||
- <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
|
||||
|
||||
- <code>process_rule</code> Processing rules
|
||||
@ -307,6 +407,44 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
<Property name='description' type='string' key='description'>
|
||||
Knowledge description (optional)
|
||||
</Property>
|
||||
<Property name='doc_type' type='string' key='doc_type'>
|
||||
Type of document (optional):
|
||||
- <code>book</code> Book
|
||||
- <code>web_page</code> Web page
|
||||
- <code>paper</code> Academic paper/article
|
||||
- <code>social_media_post</code> Social media post
|
||||
- <code>wikipedia_entry</code> Wikipedia entry
|
||||
- <code>personal_document</code> Personal document
|
||||
- <code>business_document</code> Business document
|
||||
- <code>im_chat_log</code> Chat log
|
||||
- <code>synced_from_notion</code> Notion document
|
||||
- <code>synced_from_github</code> GitHub document
|
||||
- <code>others</code> Other document types
|
||||
</Property>
|
||||
<Property name='doc_metadata' type='object' key='doc_metadata'>
|
||||
Document metadata (required if doc_type is provided). Fields vary by doc_type:
|
||||
For <code>book</code>:
|
||||
- <code>title</code> Book title
|
||||
- <code>language</code> Book language
|
||||
- <code>author</code> Book author
|
||||
- <code>publisher</code> Publisher name
|
||||
- <code>publication_date</code> Publication date
|
||||
- <code>isbn</code> ISBN number
|
||||
- <code>category</code> Book category
|
||||
|
||||
For <code>web_page</code>:
|
||||
- <code>title</code> Page title
|
||||
- <code>url</code> Page URL
|
||||
- <code>language</code> Page language
|
||||
- <code>publish_date</code> Publish date
|
||||
- <code>author/publisher</code> Author or publisher
|
||||
- <code>topic/keywords</code> Topic or keywords
|
||||
- <code>description</code> Page description
|
||||
|
||||
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
|
||||
|
||||
For doc_type "others", any valid JSON object is accepted
|
||||
</Property>
|
||||
<Property name='indexing_technique' type='string' key='indexing_technique'>
|
||||
Index technique (optional)
|
||||
- <code>high_quality</code> High quality
|
||||
@ -624,6 +762,67 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
- <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
|
||||
- <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
|
||||
- <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
|
||||
- <code>doc_type</code> Type of document (optional)
|
||||
- <code>book</code> Book
|
||||
Document records a book or publication
|
||||
- <code>web_page</code> Web page
|
||||
Document records web page content
|
||||
- <code>paper</code> Academic paper/article
|
||||
Document records academic paper or research article
|
||||
- <code>social_media_post</code> Social media post
|
||||
Content from social media posts
|
||||
- <code>wikipedia_entry</code> Wikipedia entry
|
||||
Content from Wikipedia entries
|
||||
- <code>personal_document</code> Personal document
|
||||
Documents related to personal content
|
||||
- <code>business_document</code> Business document
|
||||
Documents related to business content
|
||||
- <code>im_chat_log</code> Chat log
|
||||
Records of instant messaging chats
|
||||
- <code>synced_from_notion</code> Notion document
|
||||
Documents synchronized from Notion
|
||||
- <code>synced_from_github</code> GitHub document
|
||||
Documents synchronized from GitHub
|
||||
- <code>others</code> Other document types
|
||||
Other document types not listed above
|
||||
|
||||
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
|
||||
Fields vary by doc_type:
|
||||
|
||||
For <code>book</code>:
|
||||
- <code>title</code> Book title
|
||||
Title of the book
|
||||
- <code>language</code> Book language
|
||||
Language of the book
|
||||
- <code>author</code> Book author
|
||||
Author of the book
|
||||
- <code>publisher</code> Publisher name
|
||||
Name of the publishing house
|
||||
- <code>publication_date</code> Publication date
|
||||
Date when the book was published
|
||||
- <code>isbn</code> ISBN number
|
||||
International Standard Book Number
|
||||
- <code>category</code> Book category
|
||||
Category or genre of the book
|
||||
|
||||
For <code>web_page</code>:
|
||||
- <code>title</code> Page title
|
||||
Title of the web page
|
||||
- <code>url</code> Page URL
|
||||
URL address of the web page
|
||||
- <code>language</code> Page language
|
||||
Language of the web page
|
||||
- <code>publish_date</code> Publish date
|
||||
Date when the web page was published
|
||||
- <code>author/publisher</code> Author or publisher
|
||||
Author or publisher of the web page
|
||||
- <code>topic/keywords</code> Topic or keywords
|
||||
Topics or keywords of the web page
|
||||
- <code>description</code> Page description
|
||||
Description of the web page content
|
||||
|
||||
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
|
||||
For doc_type "others", any valid JSON object is accepted
|
||||
</Property>
|
||||
</Properties>
|
||||
</Col>
|
||||
|
@ -47,6 +47,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
<Property name='text' type='string' key='text'>
|
||||
文档内容
|
||||
</Property>
|
||||
<Property name='doc_type' type='string' key='doc_type'>
|
||||
文档类型(选填)
|
||||
- <code>book</code> 图书 Book
|
||||
- <code>web_page</code> 网页 Web page
|
||||
- <code>paper</code> 学术论文/文章 Academic paper/article
|
||||
- <code>social_media_post</code> 社交媒体帖子 Social media post
|
||||
- <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
|
||||
- <code>personal_document</code> 个人文档 Personal document
|
||||
- <code>business_document</code> 商业文档 Business document
|
||||
- <code>im_chat_log</code> 即时通讯记录 Chat log
|
||||
- <code>synced_from_notion</code> Notion同步文档 Notion document
|
||||
- <code>synced_from_github</code> GitHub同步文档 GitHub document
|
||||
- <code>others</code> 其他文档类型 Other document types
|
||||
</Property>
|
||||
<Property name='doc_metadata' type='object' key='doc_metadata'>
|
||||
|
||||
文档元数据(如提供文档类型则必填)。字段因文档类型而异:
|
||||
|
||||
针对图书 For <code>book</code>:
|
||||
- <code>title</code> 书名 Book title
|
||||
- <code>language</code> 图书语言 Book language
|
||||
- <code>author</code> 作者 Book author
|
||||
- <code>publisher</code> 出版社 Publisher name
|
||||
- <code>publication_date</code> 出版日期 Publication date
|
||||
- <code>isbn</code> ISBN号码 ISBN number
|
||||
- <code>category</code> 图书分类 Book category
|
||||
|
||||
针对网页 For <code>web_page</code>:
|
||||
- <code>title</code> 页面标题 Page title
|
||||
- <code>url</code> 页面网址 Page URL
|
||||
- <code>language</code> 页面语言 Page language
|
||||
- <code>publish_date</code> 发布日期 Publish date
|
||||
- <code>author/publisher</code> 作者/发布者 Author or publisher
|
||||
- <code>topic/keywords</code> 主题/关键词 Topic or keywords
|
||||
- <code>description</code> 页面描述 Page description
|
||||
|
||||
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
|
||||
|
||||
针对"其他"类型文档,接受任何有效的JSON对象
|
||||
</Property>
|
||||
<Property name='indexing_technique' type='string' key='indexing_technique'>
|
||||
索引方式
|
||||
- <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
|
||||
@ -194,6 +234,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
- <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式
|
||||
- <code>hierarchical_model</code> parent-child 模式
|
||||
- <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
|
||||
- <code>doc_type</code> 文档类型(选填)Type of document (optional)
|
||||
- <code>book</code> 图书
|
||||
文档记录一本书籍或出版物
|
||||
- <code>web_page</code> 网页
|
||||
网页内容的文档记录
|
||||
- <code>paper</code> 学术论文/文章
|
||||
学术论文或研究文章的记录
|
||||
- <code>social_media_post</code> 社交媒体帖子
|
||||
社交媒体上的帖子内容
|
||||
- <code>wikipedia_entry</code> 维基百科条目
|
||||
维基百科的词条内容
|
||||
- <code>personal_document</code> 个人文档
|
||||
个人相关的文档记录
|
||||
- <code>business_document</code> 商业文档
|
||||
商业相关的文档记录
|
||||
- <code>im_chat_log</code> 即时通讯记录
|
||||
即时通讯的聊天记录
|
||||
- <code>synced_from_notion</code> Notion同步文档
|
||||
从Notion同步的文档内容
|
||||
- <code>synced_from_github</code> GitHub同步文档
|
||||
从GitHub同步的文档内容
|
||||
- <code>others</code> 其他文档类型
|
||||
其他未列出的文档类型
|
||||
|
||||
- <code>doc_metadata</code> 文档元数据(如提供文档类型则必填
|
||||
字段因文档类型而异
|
||||
|
||||
针对图书类型 For <code>book</code>:
|
||||
- <code>title</code> 书名
|
||||
书籍的标题
|
||||
- <code>language</code> 图书语言
|
||||
书籍的语言
|
||||
- <code>author</code> 作者
|
||||
书籍的作者
|
||||
- <code>publisher</code> 出版社
|
||||
出版社的名称
|
||||
- <code>publication_date</code> 出版日期
|
||||
书籍的出版日期
|
||||
- <code>isbn</code> ISBN号码
|
||||
书籍的ISBN编号
|
||||
- <code>category</code> 图书分类
|
||||
书籍的分类类别
|
||||
|
||||
针对网页类型 For <code>web_page</code>:
|
||||
- <code>title</code> 页面标题
|
||||
网页的标题
|
||||
- <code>url</code> 页面网址
|
||||
网页的URL地址
|
||||
- <code>language</code> 页面语言
|
||||
网页的语言
|
||||
- <code>publish_date</code> 发布日期
|
||||
网页的发布日期
|
||||
- <code>author/publisher</code> 作者/发布者
|
||||
网页的作者或发布者
|
||||
- <code>topic/keywords</code> 主题/关键词
|
||||
网页的主题或关键词
|
||||
- <code>description</code> 页面描述
|
||||
网页的描述信息
|
||||
|
||||
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
|
||||
|
||||
针对"其他"类型文档,接受任何有效的JSON对象
|
||||
|
||||
- <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>
|
||||
|
||||
@ -504,6 +606,46 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
<Property name='text' type='string' key='text'>
|
||||
文档内容(选填)
|
||||
</Property>
|
||||
<Property name='doc_type' type='string' key='doc_type'>
|
||||
文档类型(选填)
|
||||
- <code>book</code> 图书 Book
|
||||
- <code>web_page</code> 网页 Web page
|
||||
- <code>paper</code> 学术论文/文章 Academic paper/article
|
||||
- <code>social_media_post</code> 社交媒体帖子 Social media post
|
||||
- <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
|
||||
- <code>personal_document</code> 个人文档 Personal document
|
||||
- <code>business_document</code> 商业文档 Business document
|
||||
- <code>im_chat_log</code> 即时通讯记录 Chat log
|
||||
- <code>synced_from_notion</code> Notion同步文档 Notion document
|
||||
- <code>synced_from_github</code> GitHub同步文档 GitHub document
|
||||
- <code>others</code> 其他文档类型 Other document types
|
||||
</Property>
|
||||
<Property name='doc_metadata' type='object' key='doc_metadata'>
|
||||
|
||||
文档元数据(如提供文档类型则必填)。字段因文档类型而异:
|
||||
|
||||
针对图书 For <code>book</code>:
|
||||
- <code>title</code> 书名 Book title
|
||||
- <code>language</code> 图书语言 Book language
|
||||
- <code>author</code> 作者 Book author
|
||||
- <code>publisher</code> 出版社 Publisher name
|
||||
- <code>publication_date</code> 出版日期 Publication date
|
||||
- <code>isbn</code> ISBN号码 ISBN number
|
||||
- <code>category</code> 图书分类 Book category
|
||||
|
||||
针对网页 For <code>web_page</code>:
|
||||
- <code>title</code> 页面标题 Page title
|
||||
- <code>url</code> 页面网址 Page URL
|
||||
- <code>language</code> 页面语言 Page language
|
||||
- <code>publish_date</code> 发布日期 Publish date
|
||||
- <code>author/publisher</code> 作者/发布者 Author or publisher
|
||||
- <code>topic/keywords</code> 主题/关键词 Topic or keywords
|
||||
- <code>description</code> 页面描述 Page description
|
||||
|
||||
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
|
||||
|
||||
针对"其他"类型文档,接受任何有效的JSON对象
|
||||
</Property>
|
||||
<Property name='process_rule' type='object' key='process_rule'>
|
||||
处理规则(选填)
|
||||
- <code>mode</code> (string) 清洗、分段模式 ,automatic 自动 / custom 自定义
|
||||
@ -624,6 +766,68 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
- <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code>
|
||||
- <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
|
||||
- <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
|
||||
- <code>doc_type</code> 文档类型(选填)Type of document (optional)
|
||||
- <code>book</code> 图书
|
||||
文档记录一本书籍或出版物
|
||||
- <code>web_page</code> 网页
|
||||
网页内容的文档记录
|
||||
- <code>paper</code> 学术论文/文章
|
||||
学术论文或研究文章的记录
|
||||
- <code>social_media_post</code> 社交媒体帖子
|
||||
社交媒体上的帖子内容
|
||||
- <code>wikipedia_entry</code> 维基百科条目
|
||||
维基百科的词条内容
|
||||
- <code>personal_document</code> 个人文档
|
||||
个人相关的文档记录
|
||||
- <code>business_document</code> 商业文档
|
||||
商业相关的文档记录
|
||||
- <code>im_chat_log</code> 即时通讯记录
|
||||
即时通讯的聊天记录
|
||||
- <code>synced_from_notion</code> Notion同步文档
|
||||
从Notion同步的文档内容
|
||||
- <code>synced_from_github</code> GitHub同步文档
|
||||
从GitHub同步的文档内容
|
||||
- <code>others</code> 其他文档类型
|
||||
其他未列出的文档类型
|
||||
|
||||
- <code>doc_metadata</code> 文档元数据(如提供文档类型则必填
|
||||
字段因文档类型而异
|
||||
|
||||
针对图书类型 For <code>book</code>:
|
||||
- <code>title</code> 书名
|
||||
书籍的标题
|
||||
- <code>language</code> 图书语言
|
||||
书籍的语言
|
||||
- <code>author</code> 作者
|
||||
书籍的作者
|
||||
- <code>publisher</code> 出版社
|
||||
出版社的名称
|
||||
- <code>publication_date</code> 出版日期
|
||||
书籍的出版日期
|
||||
- <code>isbn</code> ISBN号码
|
||||
书籍的ISBN编号
|
||||
- <code>category</code> 图书分类
|
||||
书籍的分类类别
|
||||
|
||||
针对网页类型 For <code>web_page</code>:
|
||||
- <code>title</code> 页面标题
|
||||
网页的标题
|
||||
- <code>url</code> 页面网址
|
||||
网页的URL地址
|
||||
- <code>language</code> 页面语言
|
||||
网页的语言
|
||||
- <code>publish_date</code> 发布日期
|
||||
网页的发布日期
|
||||
- <code>author/publisher</code> 作者/发布者
|
||||
网页的作者或发布者
|
||||
- <code>topic/keywords</code> 主题/关键词
|
||||
网页的主题或关键词
|
||||
- <code>description</code> 页面描述
|
||||
网页的描述信息
|
||||
|
||||
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
|
||||
|
||||
针对"其他"类型文档,接受任何有效的JSON对象
|
||||
</Property>
|
||||
</Properties>
|
||||
</Col>
|
||||
|
Loading…
x
Reference in New Issue
Block a user