mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 14:59:01 +08:00
Fix/create document by api with metadata (#16307)
Co-authored-by: zxhlyh <jasonapring2015@outlook.com>
This commit is contained in:
parent
c1f3d968bf
commit
2c9af712a2
@ -20,7 +20,7 @@ from libs.helper import email as email_validate
|
||||
from libs.password import hash_password, password_pattern, valid_password
|
||||
from libs.rsa import generate_key_pair
|
||||
from models import Tenant
|
||||
from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment
|
||||
from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment
|
||||
from models.dataset import Document as DatasetDocument
|
||||
from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation
|
||||
from models.provider import Provider, ProviderModel
|
||||
@ -483,14 +483,11 @@ def convert_to_agent_apps():
|
||||
click.echo(click.style("Conversion complete. Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green"))
|
||||
|
||||
|
||||
@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.")
|
||||
@click.command("add-qdrant-index", help="Add Qdrant index.")
|
||||
@click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.")
|
||||
def add_qdrant_doc_id_index(field: str):
|
||||
click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green"))
|
||||
vector_type = dify_config.VECTOR_STORE
|
||||
if vector_type != "qdrant":
|
||||
click.echo(click.style("This command only supports Qdrant vector store.", fg="red"))
|
||||
return
|
||||
def add_qdrant_index(field: str):
|
||||
click.echo(click.style("Starting Qdrant index creation.", fg="green"))
|
||||
|
||||
create_count = 0
|
||||
|
||||
try:
|
||||
@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str):
|
||||
click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green"))
|
||||
|
||||
|
||||
@click.command("old-metadata-migration", help="Old metadata migration.")
def old_metadata_migration():
    """
    Migrate legacy per-document ``doc_metadata`` into the metadata tables.

    Walks every document that has ``doc_metadata`` set, 50 per page. For each
    key found in a document's metadata it makes sure that:

    * a ``DatasetMetadata`` row (type ``"string"``) with that name exists for
      the document's dataset, and
    * a ``DatasetMetadataBinding`` row links that metadata to the document.

    Existing rows are reused, so re-running the command does not create
    duplicates. Only the metadata *keys* are migrated here; the values stay
    on the document.
    """
    click.echo(click.style("Starting old metadata migration.", fg="green"))

    page = 1
    while True:
        try:
            documents = (
                # ``.isnot(None)`` renders ``IS NOT NULL`` in SQL. A plain Python
                # ``is not None`` on a column attribute evaluates to ``True``
                # immediately and would not filter anything.
                DatasetDocument.query.filter(DatasetDocument.doc_metadata.isnot(None))
                .order_by(DatasetDocument.created_at.desc())
                .paginate(page=page, per_page=50)
            )
        except NotFound:
            # paginate() raises NotFound once the page number is past the last page.
            break
        # Check the page's rows rather than the pagination object itself.
        if not documents.items:
            break

        for document in documents.items:
            doc_metadata = document.doc_metadata
            # The column may still hold an empty / JSON-null value; nothing to migrate then.
            if not doc_metadata:
                continue

            for key in doc_metadata:
                dataset_metadata = (
                    db.session.query(DatasetMetadata)
                    .filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key)
                    .first()
                )

                dataset_metadata_binding = None
                if not dataset_metadata:
                    dataset_metadata = DatasetMetadata(
                        tenant_id=document.tenant_id,
                        dataset_id=document.dataset_id,
                        name=key,
                        type="string",
                        created_by=document.created_by,
                    )
                    db.session.add(dataset_metadata)
                    # Flush so dataset_metadata.id is populated before it is referenced below.
                    db.session.flush()
                else:
                    # Metadata already exists: it may already be bound to this document.
                    dataset_metadata_binding = DatasetMetadataBinding.query.filter(
                        DatasetMetadataBinding.dataset_id == document.dataset_id,
                        DatasetMetadataBinding.document_id == document.id,
                        DatasetMetadataBinding.metadata_id == dataset_metadata.id,
                    ).first()

                if not dataset_metadata_binding:
                    dataset_metadata_binding = DatasetMetadataBinding(
                        tenant_id=document.tenant_id,
                        dataset_id=document.dataset_id,
                        metadata_id=dataset_metadata.id,
                        document_id=document.id,
                        created_by=document.created_by,
                    )
                    db.session.add(dataset_metadata_binding)

            # Persist this document's metadata and bindings before moving on.
            db.session.commit()
        page += 1
    click.echo(click.style("Old metadata migration completed.", fg="green"))
|
||||
|
||||
|
||||
@click.command("create-tenant", help="Create account and tenant.")
|
||||
@click.option("--email", prompt=True, help="Tenant account email.")
|
||||
@click.option("--name", prompt=True, help="Workspace name.")
|
||||
|
@ -18,7 +18,6 @@ from controllers.service_api.app.error import (
|
||||
from controllers.service_api.dataset.error import (
|
||||
ArchivedDocumentImmutableError,
|
||||
DocumentIndexingError,
|
||||
InvalidMetadataError,
|
||||
)
|
||||
from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
|
||||
from core.errors.error import ProviderTokenNotInitError
|
||||
@ -51,8 +50,6 @@ class DocumentAddByTextApi(DatasetApiResource):
|
||||
"indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json"
|
||||
)
|
||||
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
|
||||
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
|
||||
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
|
||||
|
||||
args = parser.parse_args()
|
||||
dataset_id = str(dataset_id)
|
||||
@ -65,28 +62,6 @@ class DocumentAddByTextApi(DatasetApiResource):
|
||||
if not dataset.indexing_technique and not args["indexing_technique"]:
|
||||
raise ValueError("indexing_technique is required.")
|
||||
|
||||
# Validate metadata if provided
|
||||
if args.get("doc_type") or args.get("doc_metadata"):
|
||||
if not args.get("doc_type") or not args.get("doc_metadata"):
|
||||
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
|
||||
|
||||
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
|
||||
raise InvalidMetadataError(
|
||||
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
|
||||
)
|
||||
|
||||
if not isinstance(args["doc_metadata"], dict):
|
||||
raise InvalidMetadataError("doc_metadata must be a dictionary")
|
||||
|
||||
# Validate metadata schema based on doc_type
|
||||
if args["doc_type"] != "others":
|
||||
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
|
||||
for key, value in args["doc_metadata"].items():
|
||||
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
|
||||
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
|
||||
# set to MetaDataConfig
|
||||
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
|
||||
|
||||
text = args.get("text")
|
||||
name = args.get("name")
|
||||
if text is None or name is None:
|
||||
@ -133,8 +108,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
|
||||
"doc_language", type=str, default="English", required=False, nullable=False, location="json"
|
||||
)
|
||||
parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json")
|
||||
parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json")
|
||||
parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json")
|
||||
args = parser.parse_args()
|
||||
dataset_id = str(dataset_id)
|
||||
tenant_id = str(tenant_id)
|
||||
@ -146,29 +119,6 @@ class DocumentUpdateByTextApi(DatasetApiResource):
|
||||
# indexing_technique is already set in dataset since this is an update
|
||||
args["indexing_technique"] = dataset.indexing_technique
|
||||
|
||||
# Validate metadata if provided
|
||||
if args.get("doc_type") or args.get("doc_metadata"):
|
||||
if not args.get("doc_type") or not args.get("doc_metadata"):
|
||||
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
|
||||
|
||||
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
|
||||
raise InvalidMetadataError(
|
||||
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
|
||||
)
|
||||
|
||||
if not isinstance(args["doc_metadata"], dict):
|
||||
raise InvalidMetadataError("doc_metadata must be a dictionary")
|
||||
|
||||
# Validate metadata schema based on doc_type
|
||||
if args["doc_type"] != "others":
|
||||
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
|
||||
for key, value in args["doc_metadata"].items():
|
||||
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
|
||||
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
|
||||
|
||||
# set to MetaDataConfig
|
||||
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
|
||||
|
||||
if args["text"]:
|
||||
text = args.get("text")
|
||||
name = args.get("name")
|
||||
@ -216,29 +166,6 @@ class DocumentAddByFileApi(DatasetApiResource):
|
||||
if "doc_language" not in args:
|
||||
args["doc_language"] = "English"
|
||||
|
||||
# Validate metadata if provided
|
||||
if args.get("doc_type") or args.get("doc_metadata"):
|
||||
if not args.get("doc_type") or not args.get("doc_metadata"):
|
||||
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
|
||||
|
||||
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
|
||||
raise InvalidMetadataError(
|
||||
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
|
||||
)
|
||||
|
||||
if not isinstance(args["doc_metadata"], dict):
|
||||
raise InvalidMetadataError("doc_metadata must be a dictionary")
|
||||
|
||||
# Validate metadata schema based on doc_type
|
||||
if args["doc_type"] != "others":
|
||||
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
|
||||
for key, value in args["doc_metadata"].items():
|
||||
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
|
||||
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
|
||||
|
||||
# set to MetaDataConfig
|
||||
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
|
||||
|
||||
# get dataset info
|
||||
dataset_id = str(dataset_id)
|
||||
tenant_id = str(tenant_id)
|
||||
@ -306,29 +233,6 @@ class DocumentUpdateByFileApi(DatasetApiResource):
|
||||
if "doc_language" not in args:
|
||||
args["doc_language"] = "English"
|
||||
|
||||
# Validate metadata if provided
|
||||
if args.get("doc_type") or args.get("doc_metadata"):
|
||||
if not args.get("doc_type") or not args.get("doc_metadata"):
|
||||
raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata")
|
||||
|
||||
if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA:
|
||||
raise InvalidMetadataError(
|
||||
"Invalid doc_type. Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys())
|
||||
)
|
||||
|
||||
if not isinstance(args["doc_metadata"], dict):
|
||||
raise InvalidMetadataError("doc_metadata must be a dictionary")
|
||||
|
||||
# Validate metadata schema based on doc_type
|
||||
if args["doc_type"] != "others":
|
||||
metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]]
|
||||
for key, value in args["doc_metadata"].items():
|
||||
if key in metadata_schema and not isinstance(value, metadata_schema[key]):
|
||||
raise InvalidMetadataError(f"Invalid type for metadata field {key}")
|
||||
|
||||
# set to MetaDataConfig
|
||||
args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]}
|
||||
|
||||
# get dataset info
|
||||
dataset_id = str(dataset_id)
|
||||
tenant_id = str(tenant_id)
|
||||
|
@ -3,7 +3,7 @@ from dify_app import DifyApp
|
||||
|
||||
def init_app(app: DifyApp):
|
||||
from commands import (
|
||||
add_qdrant_doc_id_index,
|
||||
add_qdrant_index,
|
||||
convert_to_agent_apps,
|
||||
create_tenant,
|
||||
extract_plugins,
|
||||
@ -11,6 +11,7 @@ def init_app(app: DifyApp):
|
||||
fix_app_site_missing,
|
||||
install_plugins,
|
||||
migrate_data_for_plugin,
|
||||
old_metadata_migration,
|
||||
reset_email,
|
||||
reset_encrypt_key_pair,
|
||||
reset_password,
|
||||
@ -24,7 +25,7 @@ def init_app(app: DifyApp):
|
||||
reset_encrypt_key_pair,
|
||||
vdb_migrate,
|
||||
convert_to_agent_apps,
|
||||
add_qdrant_doc_id_index,
|
||||
add_qdrant_index,
|
||||
create_tenant,
|
||||
upgrade_db,
|
||||
fix_app_site_missing,
|
||||
@ -32,6 +33,7 @@ def init_app(app: DifyApp):
|
||||
extract_plugins,
|
||||
extract_unique_plugins,
|
||||
install_plugins,
|
||||
old_metadata_migration,
|
||||
]
|
||||
for cmd in cmds_to_register:
|
||||
app.cli.add_command(cmd)
|
||||
|
@ -46,7 +46,6 @@ from models.source import DataSourceOauthBinding
|
||||
from services.entities.knowledge_entities.knowledge_entities import (
|
||||
ChildChunkUpdateArgs,
|
||||
KnowledgeConfig,
|
||||
MetaDataConfig,
|
||||
RerankingModel,
|
||||
RetrievalModel,
|
||||
SegmentUpdateArgs,
|
||||
@ -999,9 +998,6 @@ class DocumentService:
|
||||
document.data_source_info = json.dumps(data_source_info)
|
||||
document.batch = batch
|
||||
document.indexing_status = "waiting"
|
||||
if knowledge_config.metadata:
|
||||
document.doc_type = knowledge_config.metadata.doc_type
|
||||
document.metadata = knowledge_config.metadata.doc_metadata
|
||||
db.session.add(document)
|
||||
documents.append(document)
|
||||
duplicate_document_ids.append(document.id)
|
||||
@ -1018,7 +1014,6 @@ class DocumentService:
|
||||
account,
|
||||
file_name,
|
||||
batch,
|
||||
knowledge_config.metadata,
|
||||
)
|
||||
db.session.add(document)
|
||||
db.session.flush()
|
||||
@ -1076,7 +1071,6 @@ class DocumentService:
|
||||
account,
|
||||
truncated_page_name,
|
||||
batch,
|
||||
knowledge_config.metadata,
|
||||
)
|
||||
db.session.add(document)
|
||||
db.session.flush()
|
||||
@ -1117,7 +1111,6 @@ class DocumentService:
|
||||
account,
|
||||
document_name,
|
||||
batch,
|
||||
knowledge_config.metadata,
|
||||
)
|
||||
db.session.add(document)
|
||||
db.session.flush()
|
||||
@ -1155,7 +1148,6 @@ class DocumentService:
|
||||
account: Account,
|
||||
name: str,
|
||||
batch: str,
|
||||
metadata: Optional[MetaDataConfig] = None,
|
||||
):
|
||||
document = Document(
|
||||
tenant_id=dataset.tenant_id,
|
||||
@ -1180,9 +1172,6 @@ class DocumentService:
|
||||
BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
BuiltInField.source: data_source_type,
|
||||
}
|
||||
if metadata is not None:
|
||||
doc_metadata.update(metadata.doc_metadata)
|
||||
document.doc_type = metadata.doc_type
|
||||
if doc_metadata:
|
||||
document.doc_metadata = doc_metadata
|
||||
return document
|
||||
@ -1297,10 +1286,6 @@ class DocumentService:
|
||||
# update document name
|
||||
if document_data.name:
|
||||
document.name = document_data.name
|
||||
# update doc_type and doc_metadata if provided
|
||||
if document_data.metadata is not None:
|
||||
document.doc_metadata = document_data.metadata.doc_metadata
|
||||
document.doc_type = document_data.metadata.doc_type
|
||||
# update document to be waiting
|
||||
document.indexing_status = "waiting"
|
||||
document.completed_at = None
|
||||
|
@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel):
|
||||
embedding_model: Optional[str] = None
|
||||
embedding_model_provider: Optional[str] = None
|
||||
name: Optional[str] = None
|
||||
metadata: Optional[MetaDataConfig] = None
|
||||
|
||||
|
||||
class SegmentUpdateArgs(BaseModel):
|
||||
|
@ -47,44 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
<Property name='text' type='string' key='text'>
|
||||
Document content
|
||||
</Property>
|
||||
<Property name='doc_type' type='string' key='doc_type'>
|
||||
Type of document (optional):
|
||||
- <code>book</code> Book
|
||||
- <code>web_page</code> Web page
|
||||
- <code>paper</code> Academic paper/article
|
||||
- <code>social_media_post</code> Social media post
|
||||
- <code>wikipedia_entry</code> Wikipedia entry
|
||||
- <code>personal_document</code> Personal document
|
||||
- <code>business_document</code> Business document
|
||||
- <code>im_chat_log</code> Chat log
|
||||
- <code>synced_from_notion</code> Notion document
|
||||
- <code>synced_from_github</code> GitHub document
|
||||
- <code>others</code> Other document types
|
||||
</Property>
|
||||
<Property name='doc_metadata' type='object' key='doc_metadata'>
|
||||
Document metadata (required if doc_type is provided). Fields vary by doc_type:
|
||||
For <code>book</code>:
|
||||
- <code>title</code> Book title
|
||||
- <code>language</code> Book language
|
||||
- <code>author</code> Book author
|
||||
- <code>publisher</code> Publisher name
|
||||
- <code>publication_date</code> Publication date
|
||||
- <code>isbn</code> ISBN number
|
||||
- <code>category</code> Book category
|
||||
|
||||
For <code>web_page</code>:
|
||||
- <code>title</code> Page title
|
||||
- <code>url</code> Page URL
|
||||
- <code>language</code> Page language
|
||||
- <code>publish_date</code> Publish date
|
||||
- <code>author/publisher</code> Author or publisher
|
||||
- <code>topic/keywords</code> Topic or keywords
|
||||
- <code>description</code> Page description
|
||||
|
||||
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
|
||||
|
||||
For doc_type "others", any valid JSON object is accepted
|
||||
</Property>
|
||||
<Property name='indexing_technique' type='string' key='indexing_technique'>
|
||||
Index mode
|
||||
- <code>high_quality</code> High quality: embedding using embedding model, built as vector database index
|
||||
@ -233,68 +195,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
- <code>hierarchical_model</code> Parent-child mode
|
||||
- <code>qa_model</code> Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions
|
||||
|
||||
- <code>doc_type</code> Type of document (optional)
|
||||
- <code>book</code> Book
|
||||
Document records a book or publication
|
||||
- <code>web_page</code> Web page
|
||||
Document records web page content
|
||||
- <code>paper</code> Academic paper/article
|
||||
Document records academic paper or research article
|
||||
- <code>social_media_post</code> Social media post
|
||||
Content from social media posts
|
||||
- <code>wikipedia_entry</code> Wikipedia entry
|
||||
Content from Wikipedia entries
|
||||
- <code>personal_document</code> Personal document
|
||||
Documents related to personal content
|
||||
- <code>business_document</code> Business document
|
||||
Documents related to business content
|
||||
- <code>im_chat_log</code> Chat log
|
||||
Records of instant messaging chats
|
||||
- <code>synced_from_notion</code> Notion document
|
||||
Documents synchronized from Notion
|
||||
- <code>synced_from_github</code> GitHub document
|
||||
Documents synchronized from GitHub
|
||||
- <code>others</code> Other document types
|
||||
Other document types not listed above
|
||||
|
||||
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
|
||||
Fields vary by doc_type:
|
||||
|
||||
For <code>book</code>:
|
||||
- <code>title</code> Book title
|
||||
Title of the book
|
||||
- <code>language</code> Book language
|
||||
Language of the book
|
||||
- <code>author</code> Book author
|
||||
Author of the book
|
||||
- <code>publisher</code> Publisher name
|
||||
Name of the publishing house
|
||||
- <code>publication_date</code> Publication date
|
||||
Date when the book was published
|
||||
- <code>isbn</code> ISBN number
|
||||
International Standard Book Number
|
||||
- <code>category</code> Book category
|
||||
Category or genre of the book
|
||||
|
||||
For <code>web_page</code>:
|
||||
- <code>title</code> Page title
|
||||
Title of the web page
|
||||
- <code>url</code> Page URL
|
||||
URL address of the web page
|
||||
- <code>language</code> Page language
|
||||
Language of the web page
|
||||
- <code>publish_date</code> Publish date
|
||||
Date when the web page was published
|
||||
- <code>author/publisher</code> Author or publisher
|
||||
Author or publisher of the web page
|
||||
- <code>topic/keywords</code> Topic or keywords
|
||||
Topics or keywords of the web page
|
||||
- <code>description</code> Page description
|
||||
Description of the web page content
|
||||
|
||||
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
|
||||
For doc_type "others", any valid JSON object is accepted
|
||||
|
||||
- <code>doc_language</code> In Q&A mode, specify the language of the document, for example: <code>English</code>, <code>Chinese</code>
|
||||
|
||||
- <code>process_rule</code> Processing rules
|
||||
@ -407,44 +307,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
<Property name='description' type='string' key='description'>
|
||||
Knowledge description (optional)
|
||||
</Property>
|
||||
<Property name='doc_type' type='string' key='doc_type'>
|
||||
Type of document (optional):
|
||||
- <code>book</code> Book
|
||||
- <code>web_page</code> Web page
|
||||
- <code>paper</code> Academic paper/article
|
||||
- <code>social_media_post</code> Social media post
|
||||
- <code>wikipedia_entry</code> Wikipedia entry
|
||||
- <code>personal_document</code> Personal document
|
||||
- <code>business_document</code> Business document
|
||||
- <code>im_chat_log</code> Chat log
|
||||
- <code>synced_from_notion</code> Notion document
|
||||
- <code>synced_from_github</code> GitHub document
|
||||
- <code>others</code> Other document types
|
||||
</Property>
|
||||
<Property name='doc_metadata' type='object' key='doc_metadata'>
|
||||
Document metadata (required if doc_type is provided). Fields vary by doc_type:
|
||||
For <code>book</code>:
|
||||
- <code>title</code> Book title
|
||||
- <code>language</code> Book language
|
||||
- <code>author</code> Book author
|
||||
- <code>publisher</code> Publisher name
|
||||
- <code>publication_date</code> Publication date
|
||||
- <code>isbn</code> ISBN number
|
||||
- <code>category</code> Book category
|
||||
|
||||
For <code>web_page</code>:
|
||||
- <code>title</code> Page title
|
||||
- <code>url</code> Page URL
|
||||
- <code>language</code> Page language
|
||||
- <code>publish_date</code> Publish date
|
||||
- <code>author/publisher</code> Author or publisher
|
||||
- <code>topic/keywords</code> Topic or keywords
|
||||
- <code>description</code> Page description
|
||||
|
||||
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
|
||||
|
||||
For doc_type "others", any valid JSON object is accepted
|
||||
</Property>
|
||||
<Property name='indexing_technique' type='string' key='indexing_technique'>
|
||||
Index technique (optional)
|
||||
- <code>high_quality</code> High quality
|
||||
@ -762,67 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
- <code>separator</code> Segmentation identifier. Currently, only one delimiter is allowed. The default is <code>***</code>
|
||||
- <code>max_tokens</code> The maximum length (tokens) must be validated to be shorter than the length of the parent chunk
|
||||
- <code>chunk_overlap</code> Define the overlap between adjacent chunks (optional)
|
||||
- <code>doc_type</code> Type of document (optional)
|
||||
- <code>book</code> Book
|
||||
Document records a book or publication
|
||||
- <code>web_page</code> Web page
|
||||
Document records web page content
|
||||
- <code>paper</code> Academic paper/article
|
||||
Document records academic paper or research article
|
||||
- <code>social_media_post</code> Social media post
|
||||
Content from social media posts
|
||||
- <code>wikipedia_entry</code> Wikipedia entry
|
||||
Content from Wikipedia entries
|
||||
- <code>personal_document</code> Personal document
|
||||
Documents related to personal content
|
||||
- <code>business_document</code> Business document
|
||||
Documents related to business content
|
||||
- <code>im_chat_log</code> Chat log
|
||||
Records of instant messaging chats
|
||||
- <code>synced_from_notion</code> Notion document
|
||||
Documents synchronized from Notion
|
||||
- <code>synced_from_github</code> GitHub document
|
||||
Documents synchronized from GitHub
|
||||
- <code>others</code> Other document types
|
||||
Other document types not listed above
|
||||
|
||||
- <code>doc_metadata</code> Document metadata (required if doc_type is provided)
|
||||
Fields vary by doc_type:
|
||||
|
||||
For <code>book</code>:
|
||||
- <code>title</code> Book title
|
||||
Title of the book
|
||||
- <code>language</code> Book language
|
||||
Language of the book
|
||||
- <code>author</code> Book author
|
||||
Author of the book
|
||||
- <code>publisher</code> Publisher name
|
||||
Name of the publishing house
|
||||
- <code>publication_date</code> Publication date
|
||||
Date when the book was published
|
||||
- <code>isbn</code> ISBN number
|
||||
International Standard Book Number
|
||||
- <code>category</code> Book category
|
||||
Category or genre of the book
|
||||
|
||||
For <code>web_page</code>:
|
||||
- <code>title</code> Page title
|
||||
Title of the web page
|
||||
- <code>url</code> Page URL
|
||||
URL address of the web page
|
||||
- <code>language</code> Page language
|
||||
Language of the web page
|
||||
- <code>publish_date</code> Publish date
|
||||
Date when the web page was published
|
||||
- <code>author/publisher</code> Author or publisher
|
||||
Author or publisher of the web page
|
||||
- <code>topic/keywords</code> Topic or keywords
|
||||
Topics or keywords of the web page
|
||||
- <code>description</code> Page description
|
||||
Description of the web page content
|
||||
|
||||
Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type.
|
||||
For doc_type "others", any valid JSON object is accepted
|
||||
</Property>
|
||||
</Properties>
|
||||
</Col>
|
||||
@ -1528,7 +1329,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
"id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
|
||||
"data_source_type": "upload_file",
|
||||
"name": "readme.txt",
|
||||
"doc_type": null
|
||||
}
|
||||
},
|
||||
"score": 3.730463140527718e-05,
|
||||
|
@ -47,46 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
<Property name='text' type='string' key='text'>
|
||||
文档内容
|
||||
</Property>
|
||||
<Property name='doc_type' type='string' key='doc_type'>
|
||||
文档类型(选填)
|
||||
- <code>book</code> 图书 Book
|
||||
- <code>web_page</code> 网页 Web page
|
||||
- <code>paper</code> 学术论文/文章 Academic paper/article
|
||||
- <code>social_media_post</code> 社交媒体帖子 Social media post
|
||||
- <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
|
||||
- <code>personal_document</code> 个人文档 Personal document
|
||||
- <code>business_document</code> 商业文档 Business document
|
||||
- <code>im_chat_log</code> 即时通讯记录 Chat log
|
||||
- <code>synced_from_notion</code> Notion同步文档 Notion document
|
||||
- <code>synced_from_github</code> GitHub同步文档 GitHub document
|
||||
- <code>others</code> 其他文档类型 Other document types
|
||||
</Property>
|
||||
<Property name='doc_metadata' type='object' key='doc_metadata'>
|
||||
|
||||
文档元数据(如提供文档类型则必填)。字段因文档类型而异:
|
||||
|
||||
针对图书 For <code>book</code>:
|
||||
- <code>title</code> 书名 Book title
|
||||
- <code>language</code> 图书语言 Book language
|
||||
- <code>author</code> 作者 Book author
|
||||
- <code>publisher</code> 出版社 Publisher name
|
||||
- <code>publication_date</code> 出版日期 Publication date
|
||||
- <code>isbn</code> ISBN号码 ISBN number
|
||||
- <code>category</code> 图书分类 Book category
|
||||
|
||||
针对网页 For <code>web_page</code>:
|
||||
- <code>title</code> 页面标题 Page title
|
||||
- <code>url</code> 页面网址 Page URL
|
||||
- <code>language</code> 页面语言 Page language
|
||||
- <code>publish_date</code> 发布日期 Publish date
|
||||
- <code>author/publisher</code> 作者/发布者 Author or publisher
|
||||
- <code>topic/keywords</code> 主题/关键词 Topic or keywords
|
||||
- <code>description</code> 页面描述 Page description
|
||||
|
||||
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
|
||||
|
||||
针对"其他"类型文档,接受任何有效的JSON对象
|
||||
</Property>
|
||||
<Property name='indexing_technique' type='string' key='indexing_technique'>
|
||||
索引方式
|
||||
- <code>high_quality</code> 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引
|
||||
@ -234,68 +194,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
- <code>text_model</code> text 文档直接 embedding,经济模式默认为该模式
|
||||
- <code>hierarchical_model</code> parent-child 模式
|
||||
- <code>qa_model</code> Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding
|
||||
- <code>doc_type</code> 文档类型(选填)Type of document (optional)
|
||||
- <code>book</code> 图书
|
||||
文档记录一本书籍或出版物
|
||||
- <code>web_page</code> 网页
|
||||
网页内容的文档记录
|
||||
- <code>paper</code> 学术论文/文章
|
||||
学术论文或研究文章的记录
|
||||
- <code>social_media_post</code> 社交媒体帖子
|
||||
社交媒体上的帖子内容
|
||||
- <code>wikipedia_entry</code> 维基百科条目
|
||||
维基百科的词条内容
|
||||
- <code>personal_document</code> 个人文档
|
||||
个人相关的文档记录
|
||||
- <code>business_document</code> 商业文档
|
||||
商业相关的文档记录
|
||||
- <code>im_chat_log</code> 即时通讯记录
|
||||
即时通讯的聊天记录
|
||||
- <code>synced_from_notion</code> Notion同步文档
|
||||
从Notion同步的文档内容
|
||||
- <code>synced_from_github</code> GitHub同步文档
|
||||
从GitHub同步的文档内容
|
||||
- <code>others</code> 其他文档类型
|
||||
其他未列出的文档类型
|
||||
|
||||
- <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
|
||||
字段因文档类型而异
|
||||
|
||||
针对图书类型 For <code>book</code>:
|
||||
- <code>title</code> 书名
|
||||
书籍的标题
|
||||
- <code>language</code> 图书语言
|
||||
书籍的语言
|
||||
- <code>author</code> 作者
|
||||
书籍的作者
|
||||
- <code>publisher</code> 出版社
|
||||
出版社的名称
|
||||
- <code>publication_date</code> 出版日期
|
||||
书籍的出版日期
|
||||
- <code>isbn</code> ISBN号码
|
||||
书籍的ISBN编号
|
||||
- <code>category</code> 图书分类
|
||||
书籍的分类类别
|
||||
|
||||
针对网页类型 For <code>web_page</code>:
|
||||
- <code>title</code> 页面标题
|
||||
网页的标题
|
||||
- <code>url</code> 页面网址
|
||||
网页的URL地址
|
||||
- <code>language</code> 页面语言
|
||||
网页的语言
|
||||
- <code>publish_date</code> 发布日期
|
||||
网页的发布日期
|
||||
- <code>author/publisher</code> 作者/发布者
|
||||
网页的作者或发布者
|
||||
- <code>topic/keywords</code> 主题/关键词
|
||||
网页的主题或关键词
|
||||
- <code>description</code> 页面描述
|
||||
网页的描述信息
|
||||
|
||||
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
|
||||
|
||||
针对"其他"类型文档,接受任何有效的JSON对象
|
||||
|
||||
- <code>doc_language</code> 在 Q&A 模式下,指定文档的语言,例如:<code>English</code>、<code>Chinese</code>
|
||||
|
||||
@ -606,46 +504,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
<Property name='text' type='string' key='text'>
|
||||
文档内容(选填)
|
||||
</Property>
|
||||
<Property name='doc_type' type='string' key='doc_type'>
|
||||
文档类型(选填)
|
||||
- <code>book</code> 图书 Book
|
||||
- <code>web_page</code> 网页 Web page
|
||||
- <code>paper</code> 学术论文/文章 Academic paper/article
|
||||
- <code>social_media_post</code> 社交媒体帖子 Social media post
|
||||
- <code>wikipedia_entry</code> 维基百科条目 Wikipedia entry
|
||||
- <code>personal_document</code> 个人文档 Personal document
|
||||
- <code>business_document</code> 商业文档 Business document
|
||||
- <code>im_chat_log</code> 即时通讯记录 Chat log
|
||||
- <code>synced_from_notion</code> Notion同步文档 Notion document
|
||||
- <code>synced_from_github</code> GitHub同步文档 GitHub document
|
||||
- <code>others</code> 其他文档类型 Other document types
|
||||
</Property>
|
||||
<Property name='doc_metadata' type='object' key='doc_metadata'>
|
||||
|
||||
文档元数据(如提供文档类型则必填)。字段因文档类型而异:
|
||||
|
||||
针对图书 For <code>book</code>:
|
||||
- <code>title</code> 书名 Book title
|
||||
- <code>language</code> 图书语言 Book language
|
||||
- <code>author</code> 作者 Book author
|
||||
- <code>publisher</code> 出版社 Publisher name
|
||||
- <code>publication_date</code> 出版日期 Publication date
|
||||
- <code>isbn</code> ISBN号码 ISBN number
|
||||
- <code>category</code> 图书分类 Book category
|
||||
|
||||
针对网页 For <code>web_page</code>:
|
||||
- <code>title</code> 页面标题 Page title
|
||||
- <code>url</code> 页面网址 Page URL
|
||||
- <code>language</code> 页面语言 Page language
|
||||
- <code>publish_date</code> 发布日期 Publish date
|
||||
- <code>author/publisher</code> 作者/发布者 Author or publisher
|
||||
- <code>topic/keywords</code> 主题/关键词 Topic or keywords
|
||||
- <code>description</code> 页面描述 Page description
|
||||
|
||||
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
|
||||
|
||||
针对"其他"类型文档,接受任何有效的JSON对象
|
||||
</Property>
|
||||
<Property name='process_rule' type='object' key='process_rule'>
|
||||
处理规则(选填)
|
||||
- <code>mode</code> (string) 清洗、分段模式,automatic 自动 / custom 自定义
|
||||
@ -766,68 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
- <code>separator</code> 分段标识符,目前仅允许设置一个分隔符。默认为 <code>***</code>
|
||||
- <code>max_tokens</code> 最大长度 (token) 需要校验小于父级的长度
|
||||
- <code>chunk_overlap</code> 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填)
|
||||
- <code>doc_type</code> 文档类型(选填)Type of document (optional)
|
||||
- <code>book</code> 图书
|
||||
书籍或出版物的文档记录
|
||||
- <code>web_page</code> 网页
|
||||
网页内容的文档记录
|
||||
- <code>paper</code> 学术论文/文章
|
||||
学术论文或研究文章的记录
|
||||
- <code>social_media_post</code> 社交媒体帖子
|
||||
社交媒体上的帖子内容
|
||||
- <code>wikipedia_entry</code> 维基百科条目
|
||||
维基百科的词条内容
|
||||
- <code>personal_document</code> 个人文档
|
||||
个人相关的文档记录
|
||||
- <code>business_document</code> 商业文档
|
||||
商业相关的文档记录
|
||||
- <code>im_chat_log</code> 即时通讯记录
|
||||
即时通讯的聊天记录
|
||||
- <code>synced_from_notion</code> Notion同步文档
|
||||
从Notion同步的文档内容
|
||||
- <code>synced_from_github</code> GitHub同步文档
|
||||
从GitHub同步的文档内容
|
||||
- <code>others</code> 其他文档类型
|
||||
其他未列出的文档类型
|
||||
|
||||
- <code>doc_metadata</code> 文档元数据(如提供文档类型则必填)
|
||||
字段因文档类型而异
|
||||
|
||||
针对图书类型 For <code>book</code>:
|
||||
- <code>title</code> 书名
|
||||
书籍的标题
|
||||
- <code>language</code> 图书语言
|
||||
书籍的语言
|
||||
- <code>author</code> 作者
|
||||
书籍的作者
|
||||
- <code>publisher</code> 出版社
|
||||
出版社的名称
|
||||
- <code>publication_date</code> 出版日期
|
||||
书籍的出版日期
|
||||
- <code>isbn</code> ISBN号码
|
||||
书籍的ISBN编号
|
||||
- <code>category</code> 图书分类
|
||||
书籍的分类类别
|
||||
|
||||
针对网页类型 For <code>web_page</code>:
|
||||
- <code>title</code> 页面标题
|
||||
网页的标题
|
||||
- <code>url</code> 页面网址
|
||||
网页的URL地址
|
||||
- <code>language</code> 页面语言
|
||||
网页的语言
|
||||
- <code>publish_date</code> 发布日期
|
||||
网页的发布日期
|
||||
- <code>author/publisher</code> 作者/发布者
|
||||
网页的作者或发布者
|
||||
- <code>topic/keywords</code> 主题/关键词
|
||||
网页的主题或关键词
|
||||
- <code>description</code> 页面描述
|
||||
网页的描述信息
|
||||
|
||||
请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。
|
||||
|
||||
针对"其他"类型文档,接受任何有效的JSON对象
|
||||
</Property>
|
||||
</Properties>
|
||||
</Col>
|
||||
@ -1534,7 +1330,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
|
||||
"id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2",
|
||||
"data_source_type": "upload_file",
|
||||
"name": "readme.txt",
|
||||
"doc_type": null
|
||||
}
|
||||
},
|
||||
"score": 3.730463140527718e-05,
|
||||
|
Loading…
x
Reference in New Issue
Block a user