diff --git a/api/commands.py b/api/commands.py
index df67f29aff..94e7e74e36 100644
--- a/api/commands.py
+++ b/api/commands.py
@@ -20,7 +20,7 @@ from libs.helper import email as email_validate
from libs.password import hash_password, password_pattern, valid_password
from libs.rsa import generate_key_pair
from models import Tenant
-from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
+from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
from models.dataset import Document as DatasetDocument
from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
from models.provider import Provider, ProviderModel
@@ -483,14 +483,11 @@ def convert_to_agent_apps():
click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
-@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
+@click.command("add-qdrant-index", help="Add Qdrant index.")
@click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
-def add_qdrant_doc_id_index(field: str):
- click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
- vector_type = dify_config.VECTOR_STORE
- if vector_type != "qdrant":
- click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
- return
+def add_qdrant_index(field: str):
+ click.echo(click.style("Starting Qdrant index creation.", fg="green"))
+
create_count = 0
try:
@@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str):
click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
+@click.command("old-metadata-migration", help="Old metadata migration.")
+def old_metadata_migration():
+ """
+ Old metadata migration.
+ """
+ click.echo(click.style("Starting old metadata migration.", fg="green"))
+
+ page = 1
+ while True:
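+ # Page through documents in batches of 50; paginate() raises NotFound
+ # once the page number runs past the last page, which ends the loop.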
+ try:
+ documents = (
+ DatasetDocument.query.filter(DatasetDocument.doc_metadata.isnot(None))
+ .order_by(DatasetDocument.created_at.desc())
+ .paginate(page=page, per_page=50)
+ )
+ except NotFound:
+ break
+ if not documents.items:
+ break
+ for document in documents:
+ if document.doc_metadata:
+ doc_metadata = document.doc_metadata
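+ # For each legacy metadata key, ensure the dataset has a matching
+ # DatasetMetadata record, then bind it to this document.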
+ for key, value in doc_metadata.items():
+ dataset_metadata = (
+ db.session.query(DatasetMetadata)
+ .filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
+ .first()
+ )
+ if not dataset_metadata:
+ dataset_metadata = DatasetMetadata(
+ tenant_id=document.tenant_id,
+ dataset_id=document.dataset_id,
+ name=key,
+ type="string",
+ created_by=document.created_by,
+ )
+ db.session.add(dataset_metadata)
+ db.session.flush()
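+ # flush() populates dataset_metadata.id so the binding below can reference it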
+ dataset_metadata_binding = DatasetMetadataBinding(
+ tenant_id=document.tenant_id,
+ dataset_id=document.dataset_id,
+ metadata_id=dataset_metadata.id,
+ document_id=document.id,
+ created_by=document.created_by,
+ )
+ db.session.add(dataset_metadata_binding)
+ else:
+ dataset_metadata_binding = DatasetMetadataBinding.query.filter(
+ DatasetMetadataBinding.dataset_id == document.dataset_id,
+ DatasetMetadataBinding.document_id == document.id,
+ DatasetMetadataBinding.metadata_id == dataset_metadata.id,
+ ).first()
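+ # Create the binding only if it does not already exist, keeping the
+ # migration idempotent across re-runs.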
+ if not dataset_metadata_binding:
+ dataset_metadata_binding = DatasetMetadataBinding(
+ tenant_id=document.tenant_id,
+ dataset_id=document.dataset_id,
+ metadata_id=dataset_metadata.id,
+ document_id=document.id,
+ created_by=document.created_by,
+ )
+ db.session.add(dataset_metadata_binding)
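+ # Commit once per page of 50 documents so each transaction stays small.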
+ db.session.commit()
+ page += 1
+ click.echo(click.style("Old metadata migration completed.", fg="green"))
+
+
@click.command("create-tenant", help="Create account and tenant.")
@click.option("--email", prompt=True, help="Tenant account email.")
@click.option("--name", prompt=True, help="Workspace name.")
diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py
index d4e67b6596..995444ee48 100644
--- a/api/controllers/service_api/dataset/document.py
+++ b/api/controllers/service_api/dataset/document.py
@@ -18,7 +18,6 @@ from controllers.service_api.app.error import (
from controllers.service_api.dataset.error import (
ArchivedDocumentImmutableError,
DocumentIndexingError,
- InvalidMetadataError,
)
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
from core.errors.error import ProviderTokenNotInitError
@@ -51,8 +50,6 @@ class DocumentAddByTextApi(DatasetApiResource):
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
- parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
- parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()
dataset_id = str(dataset_id)
@@ -65,28 +62,6 @@ class DocumentAddByTextApi(DatasetApiResource):
if not dataset.indexing_technique and not args["indexing_technique"]:
raise ValueError("indexing_technique is required.")
- # Validate metadata if provided
- if args.get("doc_type") or args.get("doc_metadata"):
- if not args.get("doc_type") or not args.get("doc_metadata"):
- raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
-
- if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
- raise InvalidMetadataError(
- "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
- )
-
- if not isinstance(args["doc_metadata"], dict):
- raise InvalidMetadataError("doc_metadata must be a dictionary")
-
- # Validate metadata schema based on doc_type
- if args["doc_type"] != "others":
- metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
- for key, value in args["doc_metadata"].items():
- if key in metadata_schema and not isinstance(value, metadata_schema[key]):
- raise InvalidMetadataError(f"Invalid type for metadata field {key}")
- # set to MetaDataConfig
- args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
-
text = args.get("text")
name = args.get("name")
if text is None or name is None:
@@ -133,8 +108,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
)
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
- parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
- parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
args = parser.parse_args()
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@@ -146,29 +119,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
# indexing_technique is already set in dataset since this is an update
args["indexing_technique"] = dataset.indexing_technique
- # Validate metadata if provided
- if args.get("doc_type") or args.get("doc_metadata"):
- if not args.get("doc_type") or not args.get("doc_metadata"):
- raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
-
- if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
- raise InvalidMetadataError(
- "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
- )
-
- if not isinstance(args["doc_metadata"], dict):
- raise InvalidMetadataError("doc_metadata must be a dictionary")
-
- # Validate metadata schema based on doc_type
- if args["doc_type"] != "others":
- metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
- for key, value in args["doc_metadata"].items():
- if key in metadata_schema and not isinstance(value, metadata_schema[key]):
- raise InvalidMetadataError(f"Invalid type for metadata field {key}")
-
- # set to MetaDataConfig
- args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
-
if args["text"]:
text = args.get("text")
name = args.get("name")
@@ -216,29 +166,6 @@ class DocumentAddByFileApi(DatasetApiResource):
if "doc_language" not in args:
args["doc_language"] = "English"
- # Validate metadata if provided
- if args.get("doc_type") or args.get("doc_metadata"):
- if not args.get("doc_type") or not args.get("doc_metadata"):
- raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
-
- if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
- raise InvalidMetadataError(
- "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
- )
-
- if not isinstance(args["doc_metadata"], dict):
- raise InvalidMetadataError("doc_metadata must be a dictionary")
-
- # Validate metadata schema based on doc_type
- if args["doc_type"] != "others":
- metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
- for key, value in args["doc_metadata"].items():
- if key in metadata_schema and not isinstance(value, metadata_schema[key]):
- raise InvalidMetadataError(f"Invalid type for metadata field {key}")
-
- # set to MetaDataConfig
- args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
-
# get dataset info
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
@@ -306,29 +233,6 @@ class DocumentUpdateByFileApi(DatasetApiResource):
if "doc_language" not in args:
args["doc_language"] = "English"
- # Validate metadata if provided
- if args.get("doc_type") or args.get("doc_metadata"):
- if not args.get("doc_type") or not args.get("doc_metadata"):
- raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
-
- if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
- raise InvalidMetadataError(
- "Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
- )
-
- if not isinstance(args["doc_metadata"], dict):
- raise InvalidMetadataError("doc_metadata must be a dictionary")
-
- # Validate metadata schema based on doc_type
- if args["doc_type"] != "others":
- metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
- for key, value in args["doc_metadata"].items():
- if key in metadata_schema and not isinstance(value, metadata_schema[key]):
- raise InvalidMetadataError(f"Invalid type for metadata field {key}")
-
- # set to MetaDataConfig
- args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
-
# get dataset info
dataset_id = str(dataset_id)
tenant_id = str(tenant_id)
diff --git a/api/extensions/ext_commands.py b/api/extensions/ext_commands.py
index 3f5ae539c5..92996f75e5 100644
--- a/api/extensions/ext_commands.py
+++ b/api/extensions/ext_commands.py
@@ -3,7 +3,7 @@ from dify_app import DifyApp
def init_app(app: DifyApp):
from commands import (
- add_qdrant_doc_id_index,
+ add_qdrant_index,
convert_to_agent_apps,
create_tenant,
extract_plugins,
@@ -11,6 +11,7 @@ def init_app(app: DifyApp):
fix_app_site_missing,
install_plugins,
migrate_data_for_plugin,
+ old_metadata_migration,
reset_email,
reset_encrypt_key_pair,
reset_password,
@@ -24,7 +25,7 @@ def init_app(app: DifyApp):
reset_encrypt_key_pair,
vdb_migrate,
convert_to_agent_apps,
- add_qdrant_doc_id_index,
+ add_qdrant_index,
create_tenant,
upgrade_db,
fix_app_site_missing,
@@ -32,6 +33,7 @@ def init_app(app: DifyApp):
extract_plugins,
extract_unique_plugins,
install_plugins,
+ old_metadata_migration,
]
for cmd in cmds_to_register:
app.cli.add_command(cmd)
diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py
index 7ce4e4af22..d3654a3d48 100644
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -46,7 +46,6 @@ from models.source import DataSourceOauthBinding
from services.entities.knowledge_entities.knowledge_entities import (
ChildChunkUpdateArgs,
KnowledgeConfig,
- MetaDataConfig,
RerankingModel,
RetrievalModel,
SegmentUpdateArgs,
@@ -999,9 +998,6 @@ class DocumentService:
document.data_source_info = json.dumps(data_source_info)
document.batch = batch
document.indexing_status = "waiting"
- if knowledge_config.metadata:
- document.doc_type = knowledge_config.metadata.doc_type
- document.metadata = knowledge_config.metadata.doc_metadata
db.session.add(document)
documents.append(document)
duplicate_document_ids.append(document.id)
@@ -1018,7 +1014,6 @@ class DocumentService:
account,
file_name,
batch,
- knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -1076,7 +1071,6 @@ class DocumentService:
account,
truncated_page_name,
batch,
- knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -1117,7 +1111,6 @@ class DocumentService:
account,
document_name,
batch,
- knowledge_config.metadata,
)
db.session.add(document)
db.session.flush()
@@ -1155,7 +1148,6 @@ class DocumentService:
account: Account,
name: str,
batch: str,
- metadata: Optional[MetaDataConfig] = None,
):
document = Document(
tenant_id=dataset.tenant_id,
@@ -1180,9 +1172,6 @@ class DocumentService:
BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
BuiltInField.source: data_source_type,
}
- if metadata is not None:
- doc_metadata.update(metadata.doc_metadata)
- document.doc_type = metadata.doc_type
if doc_metadata:
document.doc_metadata = doc_metadata
return document
@@ -1297,10 +1286,6 @@ class DocumentService:
# update document name
if document_data.name:
document.name = document_data.name
- # update doc_type and doc_metadata if provided
- if document_data.metadata is not None:
- document.doc_metadata = document_data.metadata.doc_metadata
- document.doc_type = document_data.metadata.doc_type
# update document to be waiting
document.indexing_status = "waiting"
document.completed_at = None
diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py
index 37c0fb49e5..51ce596e5c 100644
--- a/api/services/entities/knowledge_entities/knowledge_entities.py
+++ b/api/services/entities/knowledge_entities/knowledge_entities.py
@@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel):
embedding_model: Optional[str] = None
embedding_model_provider: Optional[str] = None
name: Optional[str] = None
- metadata: Optional[MetaDataConfig] = None
class SegmentUpdateArgs(BaseModel):
diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx
index 7d32f8cebe..a5f4c40ef6 100644
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@@ -47,44 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
Document content
-
- Type of document (optional):
-  - <code>book</code> Book
-  - <code>web_page</code> Web page
-  - <code>paper</code> Academic paper/article
-  - <code>social_media_post</code> Social media post
-  - <code>wikipedia_entry</code> Wikipedia entry
-  - <code>personal_document</code> Personal document
-  - <code>business_document</code> Business document
-  - <code>im_chat_log</code> Chat log
-  - <code>synced_from_notion</code> Notion document
-  - <code>synced_from_github</code> GitHub document
-  - <code>others</code> Other document types
-
-
- Document metadata (required if doc_type is provided). Fields vary by doc_type:
- For <code>book</code>:
-  - <code>title</code> Book title
-  - <code>language</code> Book language
-  - <code>author</code> Book author
-  - <code>publisher</code> Publisher name
-  - <code>publication_date</code> Publication date
-  - <code>isbn</code> ISBN number
-  - <code>category</code> Book category
-
- For <code>web_page</code>:
-  - <code>title</code> Page title
-  - <code>url</code> Page URL
-  - <code>language</code> Page language
-  - <code>publish_date</code> Publish date
-  - <code>author/publisher</code> Author or publisher
-  - <code>topic/keywords</code> Topic or keywords
-  - <code>description</code> Page description
-
- Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
-
- For doc_type "others", any valid JSON object is accepted
-
Index mode
- <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
@@ -233,68 +195,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>hierarchical_model</code> Parent-child mode
- <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
-  - <code>doc_type</code> Type of document (optional)
-    - <code>book</code> Book
-      Document records a book or publication
-    - <code>web_page</code> Web page
-      Document records web page content
-    - <code>paper</code> Academic paper/article
-      Document records academic paper or research article
-    - <code>social_media_post</code> Social media post
-      Content from social media posts
-    - <code>wikipedia_entry</code> Wikipedia entry
-      Content from Wikipedia entries
-    - <code>personal_document</code> Personal document
-      Documents related to personal content
-    - <code>business_document</code> Business document
-      Documents related to business content
-    - <code>im_chat_log</code> Chat log
-      Records of instant messaging chats
-    - <code>synced_from_notion</code> Notion document
-      Documents synchronized from Notion
-    - <code>synced_from_github</code> GitHub document
-      Documents synchronized from GitHub
-    - <code>others</code> Other document types
-      Other document types not listed above
-
-  - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
-    Fields vary by doc_type:
-
-    For <code>book</code>:
-    - <code>title</code> Book title
-      Title of the book
-    - <code>language</code> Book language
-      Language of the book
-    - <code>author</code> Book author
-      Author of the book
-    - <code>publisher</code> Publisher name
-      Name of the publishing house
-    - <code>publication_date</code> Publication date
-      Date when the book was published
-    - <code>isbn</code> ISBN number
-      International Standard Book Number
-    - <code>category</code> Book category
-      Category or genre of the book
-
-    For <code>web_page</code>:
-    - <code>title</code> Page title
-      Title of the web page
-    - <code>url</code> Page URL
-      URL address of the web page
-    - <code>language</code> Page language
-      Language of the web page
-    - <code>publish_date</code> Publish date
-      Date when the web page was published
-    - <code>author/publisher</code> Author or publisher
-      Author or publisher of the web page
-    - <code>topic/keywords</code> Topic or keywords
-      Topics or keywords of the web page
-    - <code>description</code> Page description
-      Description of the web page content
-
-    Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
-    For doc_type "others", any valid JSON object is accepted
-
- <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
- <code>process_rule</code> Processing rules
@@ -407,44 +307,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
Knowledge description (optional)
-
- Type of document (optional):
-  - <code>book</code> Book
-  - <code>web_page</code> Web page
-  - <code>paper</code> Academic paper/article
-  - <code>social_media_post</code> Social media post
-  - <code>wikipedia_entry</code> Wikipedia entry
-  - <code>personal_document</code> Personal document
-  - <code>business_document</code> Business document
-  - <code>im_chat_log</code> Chat log
-  - <code>synced_from_notion</code> Notion document
-  - <code>synced_from_github</code> GitHub document
-  - <code>others</code> Other document types
-
-
- Document metadata (required if doc_type is provided). Fields vary by doc_type:
- For <code>book</code>:
-  - <code>title</code> Book title
-  - <code>language</code> Book language
-  - <code>author</code> Book author
-  - <code>publisher</code> Publisher name
-  - <code>publication_date</code> Publication date
-  - <code>isbn</code> ISBN number
-  - <code>category</code> Book category
-
- For <code>web_page</code>:
-  - <code>title</code> Page title
-  - <code>url</code> Page URL
-  - <code>language</code> Page language
-  - <code>publish_date</code> Publish date
-  - <code>author/publisher</code> Author or publisher
-  - <code>topic/keywords</code> Topic or keywords
-  - <code>description</code> Page description
-
- Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
-
- For doc_type "others", any valid JSON object is accepted
-
Index technique (optional)
- <code>high_quality</code> High quality
@@ -762,67 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
- <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
- <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
-  - <code>doc_type</code> Type of document (optional)
-    - <code>book</code> Book
-      Document records a book or publication
-    - <code>web_page</code> Web page
-      Document records web page content
-    - <code>paper</code> Academic paper/article
-      Document records academic paper or research article
-    - <code>social_media_post</code> Social media post
-      Content from social media posts
-    - <code>wikipedia_entry</code> Wikipedia entry
-      Content from Wikipedia entries
-    - <code>personal_document</code> Personal document
-      Documents related to personal content
-    - <code>business_document</code> Business document
-      Documents related to business content
-    - <code>im_chat_log</code> Chat log
-      Records of instant messaging chats
-    - <code>synced_from_notion</code> Notion document
-      Documents synchronized from Notion
-    - <code>synced_from_github</code> GitHub document
-      Documents synchronized from GitHub
-    - <code>others</code> Other document types
-      Other document types not listed above
-
-  - <code>doc_metadata</code> Document metadata (required if doc_type is provided)
-    Fields vary by doc_type:
-
-    For <code>book</code>:
-    - <code>title</code> Book title
-      Title of the book
-    - <code>language</code> Book language
-      Language of the book
-    - <code>author</code> Book author
-      Author of the book
-    - <code>publisher</code> Publisher name
-      Name of the publishing house
-    - <code>publication_date</code> Publication date
-      Date when the book was published
-    - <code>isbn</code> ISBN number
-      International Standard Book Number
-    - <code>category</code> Book category
-      Category or genre of the book
-
-    For <code>web_page</code>:
-    - <code>title</code> Page title
-      Title of the web page
-    - <code>url</code> Page URL
-      URL address of the web page
-    - <code>language</code> Page language
-      Language of the web page
-    - <code>publish_date</code> Publish date
-      Date when the web page was published
-    - <code>author/publisher</code> Author or publisher
-      Author or publisher of the web page
-    - <code>topic/keywords</code> Topic or keywords
-      Topics or keywords of the web page
-    - <code>description</code> Page description
-      Description of the web page content
-
-    Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
-    For doc_type "others", any valid JSON object is accepted
@@ -1528,7 +1329,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
"id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
"data_source_type": "upload_file",
"name": "readme.txt",
- "doc_type": null
}
},
"score": 3.730463140527718e-05,
diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx
index 8bd3d8d5eb..282849f3db 100644
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@@ -47,46 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
文档内容
-
- 文档类型(选填)
-  - <code>book</code> 图书 Book
-  - <code>web_page</code> 网页 Web page
-  - <code>paper</code> 学术论文/文章 Academic paper/article
-  - <code>social_media_post</code> 社交媒体帖子 Social media post
-  - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
-  - <code>personal_document</code> 个人文档 Personal document
-  - <code>business_document</code> 商业文档 Business document
-  - <code>im_chat_log</code> 即时通讯记录 Chat log
-  - <code>synced_from_notion</code> Notion同步文档 Notion document
-  - <code>synced_from_github</code> GitHub同步文档 GitHub document
-  - <code>others</code> 其他文档类型 Other document types
-
-
-
- 文档元数据(如提供文档类型则必填)。字段因文档类型而异:
-
- 针对图书 For <code>book</code>:
-  - <code>title</code> 书名 Book title
-  - <code>language</code> 图书语言 Book language
-  - <code>author</code> 作者 Book author
-  - <code>publisher</code> 出版社 Publisher name
-  - <code>publication_date</code> 出版日期 Publication date
-  - <code>isbn</code> ISBN号码 ISBN number
-  - <code>category</code> 图书分类 Book category
-
- 针对网页 For <code>web_page</code>:
-  - <code>title</code> 页面标题 Page title
-  - <code>url</code> 页面网址 Page URL
-  - <code>language</code> 页面语言 Page language
-  - <code>publish_date</code> 发布日期 Publish date
-  - <code>author/publisher</code> 作者/发布者 Author or publisher
-  - <code>topic/keywords</code> 主题/关键词 Topic or keywords
-  - <code>description</code> 页面描述 Page description
-
- 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
-
- 针对"其他"类型文档,接受任何有效的JSON对象
-
索引方式
- <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
@@ -234,68 +194,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式
- <code>hierarchical_model</code> parent-child 模式
- <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
-  - <code>doc_type</code> 文档类型(选填)Type of document (optional)
-    - <code>book</code> 图书
-      文档记录一本书籍或出版物
-    - <code>web_page</code> 网页
-      网页内容的文档记录
-    - <code>paper</code> 学术论文/文章
-      学术论文或研究文章的记录
-    - <code>social_media_post</code> 社交媒体帖子
-      社交媒体上的帖子内容
-    - <code>wikipedia_entry</code> 维基百科条目
-      维基百科的词条内容
-    - <code>personal_document</code> 个人文档
-      个人相关的文档记录
-    - <code>business_document</code> 商业文档
-      商业相关的文档记录
-    - <code>im_chat_log</code> 即时通讯记录
-      即时通讯的聊天记录
-    - <code>synced_from_notion</code> Notion同步文档
-      从Notion同步的文档内容
-    - <code>synced_from_github</code> GitHub同步文档
-      从GitHub同步的文档内容
-    - <code>others</code> 其他文档类型
-      其他未列出的文档类型
-
-  - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
-    字段因文档类型而异
-
-    针对图书类型 For <code>book</code>:
-    - <code>title</code> 书名
-      书籍的标题
-    - <code>language</code> 图书语言
-      书籍的语言
-    - <code>author</code> 作者
-      书籍的作者
-    - <code>publisher</code> 出版社
-      出版社的名称
-    - <code>publication_date</code> 出版日期
-      书籍的出版日期
-    - <code>isbn</code> ISBN号码
-      书籍的ISBN编号
-    - <code>category</code> 图书分类
-      书籍的分类类别
-
-    针对网页类型 For <code>web_page</code>:
-    - <code>title</code> 页面标题
-      网页的标题
-    - <code>url</code> 页面网址
-      网页的URL地址
-    - <code>language</code> 页面语言
-      网页的语言
-    - <code>publish_date</code> 发布日期
-      网页的发布日期
-    - <code>author/publisher</code> 作者/发布者
-      网页的作者或发布者
-    - <code>topic/keywords</code> 主题/关键词
-      网页的主题或关键词
-    - <code>description</code> 页面描述
-      网页的描述信息
-
-    请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
-
-    针对"其他"类型文档,接受任何有效的JSON对象
- <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>
@@ -606,46 +504,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
文档内容(选填)
-
- 文档类型(选填)
-  - <code>book</code> 图书 Book
-  - <code>web_page</code> 网页 Web page
-  - <code>paper</code> 学术论文/文章 Academic paper/article
-  - <code>social_media_post</code> 社交媒体帖子 Social media post
-  - <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
-  - <code>personal_document</code> 个人文档 Personal document
-  - <code>business_document</code> 商业文档 Business document
-  - <code>im_chat_log</code> 即时通讯记录 Chat log
-  - <code>synced_from_notion</code> Notion同步文档 Notion document
-  - <code>synced_from_github</code> GitHub同步文档 GitHub document
-  - <code>others</code> 其他文档类型 Other document types
-
-
-
- 文档元数据(如提供文档类型则必填)。字段因文档类型而异:
-
- 针对图书 For <code>book</code>:
-  - <code>title</code> 书名 Book title
-  - <code>language</code> 图书语言 Book language
-  - <code>author</code> 作者 Book author
-  - <code>publisher</code> 出版社 Publisher name
-  - <code>publication_date</code> 出版日期 Publication date
-  - <code>isbn</code> ISBN号码 ISBN number
-  - <code>category</code> 图书分类 Book category
-
- 针对网页 For <code>web_page</code>:
-  - <code>title</code> 页面标题 Page title
-  - <code>url</code> 页面网址 Page URL
-  - <code>language</code> 页面语言 Page language
-  - <code>publish_date</code> 发布日期 Publish date
-  - <code>author/publisher</code> 作者/发布者 Author or publisher
-  - <code>topic/keywords</code> 主题/关键词 Topic or keywords
-  - <code>description</code> 页面描述 Page description
-
- 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
-
- 针对"其他"类型文档,接受任何有效的JSON对象
-
处理规则(选填)
- <code>mode</code> (string) 清洗、分段模式,automatic 自动 / custom 自定义
@@ -766,68 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code>
- <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
- <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
-  - <code>doc_type</code> 文档类型(选填)Type of document (optional)
-    - <code>book</code> 图书
-      文档记录一本书籍或出版物
-    - <code>web_page</code> 网页
-      网页内容的文档记录
-    - <code>paper</code> 学术论文/文章
-      学术论文或研究文章的记录
-    - <code>social_media_post</code> 社交媒体帖子
-      社交媒体上的帖子内容
-    - <code>wikipedia_entry</code> 维基百科条目
-      维基百科的词条内容
-    - <code>personal_document</code> 个人文档
-      个人相关的文档记录
-    - <code>business_document</code> 商业文档
-      商业相关的文档记录
-    - <code>im_chat_log</code> 即时通讯记录
-      即时通讯的聊天记录
-    - <code>synced_from_notion</code> Notion同步文档
-      从Notion同步的文档内容
-    - <code>synced_from_github</code> GitHub同步文档
-      从GitHub同步的文档内容
-    - <code>others</code> 其他文档类型
-      其他未列出的文档类型
-
-  - <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
-    字段因文档类型而异
-
-    针对图书类型 For <code>book</code>:
-    - <code>title</code> 书名
-      书籍的标题
-    - <code>language</code> 图书语言
-      书籍的语言
-    - <code>author</code> 作者
-      书籍的作者
-    - <code>publisher</code> 出版社
-      出版社的名称
-    - <code>publication_date</code> 出版日期
-      书籍的出版日期
-    - <code>isbn</code> ISBN号码
-      书籍的ISBN编号
-    - <code>category</code> 图书分类
-      书籍的分类类别
-
-    针对网页类型 For <code>web_page</code>:
-    - <code>title</code> 页面标题
-      网页的标题
-    - <code>url</code> 页面网址
-      网页的URL地址
-    - <code>language</code> 页面语言
-      网页的语言
-    - <code>publish_date</code> 发布日期
-      网页的发布日期
-    - <code>author/publisher</code> 作者/发布者
-      网页的作者或发布者
-    - <code>topic/keywords</code> 主题/关键词
-      网页的主题或关键词
-    - <code>description</code> 页面描述
-      网页的描述信息
-
-    请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
-
-    针对"其他"类型文档,接受任何有效的JSON对象
@@ -1534,7 +1330,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
"id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
"data_source_type": "upload_file",
"name": "readme.txt",
- "doc_type": null
}
},
"score": 3.730463140527718e-05,