mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-14 04:35:56 +08:00
feat: add administrative commands to free up storage space by removing unused files (#18835)
This commit is contained in:
parent
b62eb61400
commit
993ef87dca
254
api/commands.py
254
api/commands.py
@ -17,6 +17,7 @@ from core.rag.models.document import Document
|
|||||||
from events.app_event import app_was_created
|
from events.app_event import app_was_created
|
||||||
from extensions.ext_database import db
|
from extensions.ext_database import db
|
||||||
from extensions.ext_redis import redis_client
|
from extensions.ext_redis import redis_client
|
||||||
|
from extensions.ext_storage import storage
|
||||||
from libs.helper import email as email_validate
|
from libs.helper import email as email_validate
|
||||||
from libs.password import hash_password, password_pattern, valid_password
|
from libs.password import hash_password, password_pattern, valid_password
|
||||||
from libs.rsa import generate_key_pair
|
from libs.rsa import generate_key_pair
|
||||||
@ -815,3 +816,256 @@ def clear_free_plan_tenant_expired_logs(days: int, batch: int, tenant_ids: list[
|
|||||||
ClearFreePlanTenantExpiredLogs.process(days, batch, tenant_ids)
|
ClearFreePlanTenantExpiredLogs.process(days, batch, tenant_ids)
|
||||||
|
|
||||||
click.echo(click.style("Clear free plan tenant expired logs completed.", fg="green"))
|
click.echo(click.style("Clear free plan tenant expired logs completed.", fg="green"))
|
||||||
|
|
||||||
|
|
||||||
|
@click.command("clear-orphaned-file-records", help="Clear orphaned file records.")
|
||||||
|
def clear_orphaned_file_records():
|
||||||
|
"""
|
||||||
|
Clear orphaned file records in the database.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# define tables and columns to process
|
||||||
|
files_tables = [
|
||||||
|
{"table": "upload_files", "id_column": "id", "key_column": "key"},
|
||||||
|
{"table": "tool_files", "id_column": "id", "key_column": "file_key"},
|
||||||
|
]
|
||||||
|
ids_tables = [
|
||||||
|
{"type": "uuid", "table": "message_files", "column": "upload_file_id"},
|
||||||
|
{"type": "text", "table": "documents", "column": "data_source_info"},
|
||||||
|
{"type": "text", "table": "document_segments", "column": "content"},
|
||||||
|
{"type": "text", "table": "messages", "column": "answer"},
|
||||||
|
{"type": "text", "table": "workflow_node_executions", "column": "inputs"},
|
||||||
|
{"type": "text", "table": "workflow_node_executions", "column": "process_data"},
|
||||||
|
{"type": "text", "table": "workflow_node_executions", "column": "outputs"},
|
||||||
|
{"type": "text", "table": "conversations", "column": "introduction"},
|
||||||
|
{"type": "text", "table": "conversations", "column": "system_instruction"},
|
||||||
|
{"type": "json", "table": "messages", "column": "inputs"},
|
||||||
|
{"type": "json", "table": "messages", "column": "message"},
|
||||||
|
]
|
||||||
|
|
||||||
|
# notify user and ask for confirmation
|
||||||
|
click.echo(
|
||||||
|
click.style("This command will find and delete orphaned file records in the following tables:", fg="yellow")
|
||||||
|
)
|
||||||
|
for files_table in files_tables:
|
||||||
|
click.echo(click.style(f"- {files_table['table']}", fg="yellow"))
|
||||||
|
click.echo(
|
||||||
|
click.style("The following tables and columns will be scanned to find orphaned file records:", fg="yellow")
|
||||||
|
)
|
||||||
|
for ids_table in ids_tables:
|
||||||
|
click.echo(click.style(f"- {ids_table['table']} ({ids_table['column']})", fg="yellow"))
|
||||||
|
click.echo("")
|
||||||
|
|
||||||
|
click.echo(click.style("!!! USE WITH CAUTION !!!", fg="red"))
|
||||||
|
click.echo(
|
||||||
|
click.style(
|
||||||
|
(
|
||||||
|
"Since not all patterns have been fully tested, "
|
||||||
|
"please note that this command may delete unintended file records."
|
||||||
|
),
|
||||||
|
fg="yellow",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
click.echo(
|
||||||
|
click.style("This cannot be undone. Please make sure to back up your database before proceeding.", fg="yellow")
|
||||||
|
)
|
||||||
|
click.confirm("Do you want to proceed?", abort=True)
|
||||||
|
|
||||||
|
# start the cleanup process
|
||||||
|
click.echo(click.style("Starting orphaned file records cleanup.", fg="white"))
|
||||||
|
|
||||||
|
try:
|
||||||
|
# fetch file id and keys from each table
|
||||||
|
all_files_in_tables = []
|
||||||
|
for files_table in files_tables:
|
||||||
|
click.echo(click.style(f"- Listing file records in table {files_table['table']}", fg="white"))
|
||||||
|
query = f"SELECT {files_table['id_column']}, {files_table['key_column']} FROM {files_table['table']}"
|
||||||
|
with db.engine.begin() as conn:
|
||||||
|
rs = conn.execute(db.text(query))
|
||||||
|
for i in rs:
|
||||||
|
all_files_in_tables.append({"table": files_table["table"], "id": str(i[0]), "key": i[1]})
|
||||||
|
click.echo(click.style(f"Found {len(all_files_in_tables)} files in tables.", fg="white"))
|
||||||
|
|
||||||
|
# fetch referred table and columns
|
||||||
|
guid_regexp = "[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
|
||||||
|
all_ids_in_tables = []
|
||||||
|
for ids_table in ids_tables:
|
||||||
|
query = ""
|
||||||
|
if ids_table["type"] == "uuid":
|
||||||
|
click.echo(
|
||||||
|
click.style(
|
||||||
|
f"- Listing file ids in column {ids_table['column']} in table {ids_table['table']}", fg="white"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
query = (
|
||||||
|
f"SELECT {ids_table['column']} FROM {ids_table['table']} WHERE {ids_table['column']} IS NOT NULL"
|
||||||
|
)
|
||||||
|
with db.engine.begin() as conn:
|
||||||
|
rs = conn.execute(db.text(query))
|
||||||
|
for i in rs:
|
||||||
|
all_ids_in_tables.append({"table": ids_table["table"], "id": str(i[0])})
|
||||||
|
elif ids_table["type"] == "text":
|
||||||
|
click.echo(
|
||||||
|
click.style(
|
||||||
|
f"- Listing file-id-like strings in column {ids_table['column']} in table {ids_table['table']}",
|
||||||
|
fg="white",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
query = (
|
||||||
|
f"SELECT regexp_matches({ids_table['column']}, '{guid_regexp}', 'g') AS extracted_id "
|
||||||
|
f"FROM {ids_table['table']}"
|
||||||
|
)
|
||||||
|
with db.engine.begin() as conn:
|
||||||
|
rs = conn.execute(db.text(query))
|
||||||
|
for i in rs:
|
||||||
|
for j in i[0]:
|
||||||
|
all_ids_in_tables.append({"table": ids_table["table"], "id": j})
|
||||||
|
elif ids_table["type"] == "json":
|
||||||
|
click.echo(
|
||||||
|
click.style(
|
||||||
|
(
|
||||||
|
f"- Listing file-id-like JSON string in column {ids_table['column']} "
|
||||||
|
f"in table {ids_table['table']}"
|
||||||
|
),
|
||||||
|
fg="white",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
query = (
|
||||||
|
f"SELECT regexp_matches({ids_table['column']}::text, '{guid_regexp}', 'g') AS extracted_id "
|
||||||
|
f"FROM {ids_table['table']}"
|
||||||
|
)
|
||||||
|
with db.engine.begin() as conn:
|
||||||
|
rs = conn.execute(db.text(query))
|
||||||
|
for i in rs:
|
||||||
|
for j in i[0]:
|
||||||
|
all_ids_in_tables.append({"table": ids_table["table"], "id": j})
|
||||||
|
click.echo(click.style(f"Found {len(all_ids_in_tables)} file ids in tables.", fg="white"))
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
click.echo(click.style(f"Error fetching keys: {str(e)}", fg="red"))
|
||||||
|
return
|
||||||
|
|
||||||
|
# find orphaned files
|
||||||
|
all_files = [file["id"] for file in all_files_in_tables]
|
||||||
|
all_ids = [file["id"] for file in all_ids_in_tables]
|
||||||
|
orphaned_files = list(set(all_files) - set(all_ids))
|
||||||
|
if not orphaned_files:
|
||||||
|
click.echo(click.style("No orphaned file records found. There is nothing to delete.", fg="green"))
|
||||||
|
return
|
||||||
|
click.echo(click.style(f"Found {len(orphaned_files)} orphaned file records.", fg="white"))
|
||||||
|
for file in orphaned_files:
|
||||||
|
click.echo(click.style(f"- orphaned file id: {file}", fg="black"))
|
||||||
|
click.confirm(f"Do you want to proceed to delete all {len(orphaned_files)} orphaned file records?", abort=True)
|
||||||
|
|
||||||
|
# delete orphaned records for each file
|
||||||
|
try:
|
||||||
|
for files_table in files_tables:
|
||||||
|
click.echo(click.style(f"- Deleting orphaned file records in table {files_table['table']}", fg="white"))
|
||||||
|
query = f"DELETE FROM {files_table['table']} WHERE {files_table['id_column']} IN :ids"
|
||||||
|
with db.engine.begin() as conn:
|
||||||
|
conn.execute(db.text(query), {"ids": tuple(orphaned_files)})
|
||||||
|
except Exception as e:
|
||||||
|
click.echo(click.style(f"Error deleting orphaned file records: {str(e)}", fg="red"))
|
||||||
|
return
|
||||||
|
click.echo(click.style(f"Removed {len(orphaned_files)} orphaned file records.", fg="green"))
|
||||||
|
|
||||||
|
|
||||||
|
@click.command("remove-orphaned-files-on-storage", help="Remove orphaned files on the storage.")
|
||||||
|
def remove_orphaned_files_on_storage():
|
||||||
|
"""
|
||||||
|
Remove orphaned files on the storage.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# define tables and columns to process
|
||||||
|
files_tables = [
|
||||||
|
{"table": "upload_files", "key_column": "key"},
|
||||||
|
{"table": "tool_files", "key_column": "file_key"},
|
||||||
|
]
|
||||||
|
storage_paths = ["image_files", "tools", "upload_files"]
|
||||||
|
|
||||||
|
# notify user and ask for confirmation
|
||||||
|
click.echo(click.style("This command will find and remove orphaned files on the storage,", fg="yellow"))
|
||||||
|
click.echo(
|
||||||
|
click.style("by comparing the files on the storage with the records in the following tables:", fg="yellow")
|
||||||
|
)
|
||||||
|
for files_table in files_tables:
|
||||||
|
click.echo(click.style(f"- {files_table['table']}", fg="yellow"))
|
||||||
|
click.echo(click.style("The following paths on the storage will be scanned to find orphaned files:", fg="yellow"))
|
||||||
|
for storage_path in storage_paths:
|
||||||
|
click.echo(click.style(f"- {storage_path}", fg="yellow"))
|
||||||
|
click.echo("")
|
||||||
|
|
||||||
|
click.echo(click.style("!!! USE WITH CAUTION !!!", fg="red"))
|
||||||
|
click.echo(
|
||||||
|
click.style(
|
||||||
|
"Currently, this command will work only for opendal based storage (STORAGE_TYPE=opendal).", fg="yellow"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
click.echo(
|
||||||
|
click.style(
|
||||||
|
"Since not all patterns have been fully tested, please note that this command may delete unintended files.",
|
||||||
|
fg="yellow",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
click.echo(
|
||||||
|
click.style("This cannot be undone. Please make sure to back up your database before proceeding.", fg="yellow")
|
||||||
|
)
|
||||||
|
click.confirm("Do you want to proceed?", abort=True)
|
||||||
|
|
||||||
|
# start the cleanup process
|
||||||
|
click.echo(click.style("Starting orphaned files cleanup.", fg="white"))
|
||||||
|
|
||||||
|
# fetch file id and keys from each table
|
||||||
|
all_files_in_tables = []
|
||||||
|
try:
|
||||||
|
for files_table in files_tables:
|
||||||
|
click.echo(click.style(f"- Listing files from table {files_table['table']}", fg="white"))
|
||||||
|
query = f"SELECT {files_table['key_column']} FROM {files_table['table']}"
|
||||||
|
with db.engine.begin() as conn:
|
||||||
|
rs = conn.execute(db.text(query))
|
||||||
|
for i in rs:
|
||||||
|
all_files_in_tables.append(str(i[0]))
|
||||||
|
click.echo(click.style(f"Found {len(all_files_in_tables)} files in tables.", fg="white"))
|
||||||
|
except Exception as e:
|
||||||
|
click.echo(click.style(f"Error fetching keys: {str(e)}", fg="red"))
|
||||||
|
|
||||||
|
all_files_on_storage = []
|
||||||
|
for storage_path in storage_paths:
|
||||||
|
try:
|
||||||
|
click.echo(click.style(f"- Scanning files on storage path {storage_path}", fg="white"))
|
||||||
|
files = storage.scan(path=storage_path, files=True, directories=False)
|
||||||
|
all_files_on_storage.extend(files)
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
click.echo(click.style(f" -> Skipping path {storage_path} as it does not exist.", fg="yellow"))
|
||||||
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
click.echo(click.style(f" -> Error scanning files on storage path {storage_path}: {str(e)}", fg="red"))
|
||||||
|
continue
|
||||||
|
click.echo(click.style(f"Found {len(all_files_on_storage)} files on storage.", fg="white"))
|
||||||
|
|
||||||
|
# find orphaned files
|
||||||
|
orphaned_files = list(set(all_files_on_storage) - set(all_files_in_tables))
|
||||||
|
if not orphaned_files:
|
||||||
|
click.echo(click.style("No orphaned files found. There is nothing to remove.", fg="green"))
|
||||||
|
return
|
||||||
|
click.echo(click.style(f"Found {len(orphaned_files)} orphaned files.", fg="white"))
|
||||||
|
for file in orphaned_files:
|
||||||
|
click.echo(click.style(f"- orphaned file: {file}", fg="black"))
|
||||||
|
click.confirm(f"Do you want to proceed to remove all {len(orphaned_files)} orphaned files?", abort=True)
|
||||||
|
|
||||||
|
# delete orphaned files
|
||||||
|
removed_files = 0
|
||||||
|
error_files = 0
|
||||||
|
for file in orphaned_files:
|
||||||
|
try:
|
||||||
|
storage.delete(file)
|
||||||
|
removed_files += 1
|
||||||
|
click.echo(click.style(f"- Removing orphaned file: {file}", fg="white"))
|
||||||
|
except Exception as e:
|
||||||
|
error_files += 1
|
||||||
|
click.echo(click.style(f"- Error deleting orphaned file {file}: {str(e)}", fg="red"))
|
||||||
|
continue
|
||||||
|
if error_files == 0:
|
||||||
|
click.echo(click.style(f"Removed {removed_files} orphaned files without errors.", fg="green"))
|
||||||
|
else:
|
||||||
|
click.echo(click.style(f"Removed {removed_files} orphaned files, with {error_files} errors.", fg="yellow"))
|
||||||
|
@ -5,6 +5,7 @@ def init_app(app: DifyApp):
|
|||||||
from commands import (
|
from commands import (
|
||||||
add_qdrant_index,
|
add_qdrant_index,
|
||||||
clear_free_plan_tenant_expired_logs,
|
clear_free_plan_tenant_expired_logs,
|
||||||
|
clear_orphaned_file_records,
|
||||||
convert_to_agent_apps,
|
convert_to_agent_apps,
|
||||||
create_tenant,
|
create_tenant,
|
||||||
extract_plugins,
|
extract_plugins,
|
||||||
@ -13,6 +14,7 @@ def init_app(app: DifyApp):
|
|||||||
install_plugins,
|
install_plugins,
|
||||||
migrate_data_for_plugin,
|
migrate_data_for_plugin,
|
||||||
old_metadata_migration,
|
old_metadata_migration,
|
||||||
|
remove_orphaned_files_on_storage,
|
||||||
reset_email,
|
reset_email,
|
||||||
reset_encrypt_key_pair,
|
reset_encrypt_key_pair,
|
||||||
reset_password,
|
reset_password,
|
||||||
@ -36,6 +38,8 @@ def init_app(app: DifyApp):
|
|||||||
install_plugins,
|
install_plugins,
|
||||||
old_metadata_migration,
|
old_metadata_migration,
|
||||||
clear_free_plan_tenant_expired_logs,
|
clear_free_plan_tenant_expired_logs,
|
||||||
|
clear_orphaned_file_records,
|
||||||
|
remove_orphaned_files_on_storage,
|
||||||
]
|
]
|
||||||
for cmd in cmds_to_register:
|
for cmd in cmds_to_register:
|
||||||
app.cli.add_command(cmd)
|
app.cli.add_command(cmd)
|
||||||
|
@ -102,6 +102,9 @@ class Storage:
|
|||||||
def delete(self, filename):
|
def delete(self, filename):
|
||||||
return self.storage_runner.delete(filename)
|
return self.storage_runner.delete(filename)
|
||||||
|
|
||||||
|
def scan(self, path: str, files: bool = True, directories: bool = False) -> list[str]:
|
||||||
|
return self.storage_runner.scan(path, files=files, directories=directories)
|
||||||
|
|
||||||
|
|
||||||
storage = Storage()
|
storage = Storage()
|
||||||
|
|
||||||
|
@ -30,3 +30,11 @@ class BaseStorage(ABC):
|
|||||||
@abstractmethod
|
@abstractmethod
|
||||||
def delete(self, filename):
|
def delete(self, filename):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def scan(self, path, files=True, directories=False) -> list[str]:
|
||||||
|
"""
|
||||||
|
Scan files and directories in the given path.
|
||||||
|
This method is implemented only in some storage backends.
|
||||||
|
If a storage backend doesn't support scanning, it will raise NotImplementedError.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError("This storage backend doesn't support scanning")
|
||||||
|
@ -80,3 +80,20 @@ class OpenDALStorage(BaseStorage):
|
|||||||
logger.debug(f"file {filename} deleted")
|
logger.debug(f"file {filename} deleted")
|
||||||
return
|
return
|
||||||
logger.debug(f"file {filename} not found, skip delete")
|
logger.debug(f"file {filename} not found, skip delete")
|
||||||
|
|
||||||
|
def scan(self, path: str, files: bool = True, directories: bool = False) -> list[str]:
|
||||||
|
if not self.exists(path):
|
||||||
|
raise FileNotFoundError("Path not found")
|
||||||
|
|
||||||
|
all_files = self.op.scan(path=path)
|
||||||
|
if files and directories:
|
||||||
|
logger.debug(f"files and directories on {path} scanned")
|
||||||
|
return [f.path for f in all_files]
|
||||||
|
if files:
|
||||||
|
logger.debug(f"files on {path} scanned")
|
||||||
|
return [f.path for f in all_files if not f.path.endswith("/")]
|
||||||
|
elif directories:
|
||||||
|
logger.debug(f"directories on {path} scanned")
|
||||||
|
return [f.path for f in all_files if f.path.endswith("/")]
|
||||||
|
else:
|
||||||
|
raise ValueError("At least one of files or directories must be True")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user