feat: add administrative commands to free up storage space by removing unused files (#18835)

This commit is contained in:
kurokobo 2025-04-27 12:11:04 +09:00 committed by GitHub
parent b62eb61400
commit 993ef87dca
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 286 additions and 0 deletions

View File

@ -17,6 +17,7 @@ from core.rag.models.document import Document
from events.app_event import app_was_created from events.app_event import app_was_created
from extensions.ext_database import db from extensions.ext_database import db
from extensions.ext_redis import redis_client from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from libs.helper import email as email_validate from libs.helper import email as email_validate
from libs.password import hash_password, password_pattern, valid_password from libs.password import hash_password, password_pattern, valid_password
from libs.rsa import generate_key_pair from libs.rsa import generate_key_pair
@ -815,3 +816,256 @@ def clear_free_plan_tenant_expired_logs(days: int, batch: int, tenant_ids: list[
ClearFreePlanTenantExpiredLogs.process(days, batch, tenant_ids) ClearFreePlanTenantExpiredLogs.process(days, batch, tenant_ids)
click.echo(click.style("Clear free plan tenant expired logs completed.", fg="green")) click.echo(click.style("Clear free plan tenant expired logs completed.", fg="green"))
@click.command("clear-orphaned-file-records", help="Clear orphaned file records.")
def clear_orphaned_file_records():
"""
Clear orphaned file records in the database.
"""
# define tables and columns to process
files_tables = [
{"table": "upload_files", "id_column": "id", "key_column": "key"},
{"table": "tool_files", "id_column": "id", "key_column": "file_key"},
]
ids_tables = [
{"type": "uuid", "table": "message_files", "column": "upload_file_id"},
{"type": "text", "table": "documents", "column": "data_source_info"},
{"type": "text", "table": "document_segments", "column": "content"},
{"type": "text", "table": "messages", "column": "answer"},
{"type": "text", "table": "workflow_node_executions", "column": "inputs"},
{"type": "text", "table": "workflow_node_executions", "column": "process_data"},
{"type": "text", "table": "workflow_node_executions", "column": "outputs"},
{"type": "text", "table": "conversations", "column": "introduction"},
{"type": "text", "table": "conversations", "column": "system_instruction"},
{"type": "json", "table": "messages", "column": "inputs"},
{"type": "json", "table": "messages", "column": "message"},
]
# notify user and ask for confirmation
click.echo(
click.style("This command will find and delete orphaned file records in the following tables:", fg="yellow")
)
for files_table in files_tables:
click.echo(click.style(f"- {files_table['table']}", fg="yellow"))
click.echo(
click.style("The following tables and columns will be scanned to find orphaned file records:", fg="yellow")
)
for ids_table in ids_tables:
click.echo(click.style(f"- {ids_table['table']} ({ids_table['column']})", fg="yellow"))
click.echo("")
click.echo(click.style("!!! USE WITH CAUTION !!!", fg="red"))
click.echo(
click.style(
(
"Since not all patterns have been fully tested, "
"please note that this command may delete unintended file records."
),
fg="yellow",
)
)
click.echo(
click.style("This cannot be undone. Please make sure to back up your database before proceeding.", fg="yellow")
)
click.confirm("Do you want to proceed?", abort=True)
# start the cleanup process
click.echo(click.style("Starting orphaned file records cleanup.", fg="white"))
try:
# fetch file id and keys from each table
all_files_in_tables = []
for files_table in files_tables:
click.echo(click.style(f"- Listing file records in table {files_table['table']}", fg="white"))
query = f"SELECT {files_table['id_column']}, {files_table['key_column']} FROM {files_table['table']}"
with db.engine.begin() as conn:
rs = conn.execute(db.text(query))
for i in rs:
all_files_in_tables.append({"table": files_table["table"], "id": str(i[0]), "key": i[1]})
click.echo(click.style(f"Found {len(all_files_in_tables)} files in tables.", fg="white"))
# fetch referred table and columns
guid_regexp = "[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}"
all_ids_in_tables = []
for ids_table in ids_tables:
query = ""
if ids_table["type"] == "uuid":
click.echo(
click.style(
f"- Listing file ids in column {ids_table['column']} in table {ids_table['table']}", fg="white"
)
)
query = (
f"SELECT {ids_table['column']} FROM {ids_table['table']} WHERE {ids_table['column']} IS NOT NULL"
)
with db.engine.begin() as conn:
rs = conn.execute(db.text(query))
for i in rs:
all_ids_in_tables.append({"table": ids_table["table"], "id": str(i[0])})
elif ids_table["type"] == "text":
click.echo(
click.style(
f"- Listing file-id-like strings in column {ids_table['column']} in table {ids_table['table']}",
fg="white",
)
)
query = (
f"SELECT regexp_matches({ids_table['column']}, '{guid_regexp}', 'g') AS extracted_id "
f"FROM {ids_table['table']}"
)
with db.engine.begin() as conn:
rs = conn.execute(db.text(query))
for i in rs:
for j in i[0]:
all_ids_in_tables.append({"table": ids_table["table"], "id": j})
elif ids_table["type"] == "json":
click.echo(
click.style(
(
f"- Listing file-id-like JSON string in column {ids_table['column']} "
f"in table {ids_table['table']}"
),
fg="white",
)
)
query = (
f"SELECT regexp_matches({ids_table['column']}::text, '{guid_regexp}', 'g') AS extracted_id "
f"FROM {ids_table['table']}"
)
with db.engine.begin() as conn:
rs = conn.execute(db.text(query))
for i in rs:
for j in i[0]:
all_ids_in_tables.append({"table": ids_table["table"], "id": j})
click.echo(click.style(f"Found {len(all_ids_in_tables)} file ids in tables.", fg="white"))
except Exception as e:
click.echo(click.style(f"Error fetching keys: {str(e)}", fg="red"))
return
# find orphaned files
all_files = [file["id"] for file in all_files_in_tables]
all_ids = [file["id"] for file in all_ids_in_tables]
orphaned_files = list(set(all_files) - set(all_ids))
if not orphaned_files:
click.echo(click.style("No orphaned file records found. There is nothing to delete.", fg="green"))
return
click.echo(click.style(f"Found {len(orphaned_files)} orphaned file records.", fg="white"))
for file in orphaned_files:
click.echo(click.style(f"- orphaned file id: {file}", fg="black"))
click.confirm(f"Do you want to proceed to delete all {len(orphaned_files)} orphaned file records?", abort=True)
# delete orphaned records for each file
try:
for files_table in files_tables:
click.echo(click.style(f"- Deleting orphaned file records in table {files_table['table']}", fg="white"))
query = f"DELETE FROM {files_table['table']} WHERE {files_table['id_column']} IN :ids"
with db.engine.begin() as conn:
conn.execute(db.text(query), {"ids": tuple(orphaned_files)})
except Exception as e:
click.echo(click.style(f"Error deleting orphaned file records: {str(e)}", fg="red"))
return
click.echo(click.style(f"Removed {len(orphaned_files)} orphaned file records.", fg="green"))
@click.command("remove-orphaned-files-on-storage", help="Remove orphaned files on the storage.")
def remove_orphaned_files_on_storage():
"""
Remove orphaned files on the storage.
"""
# define tables and columns to process
files_tables = [
{"table": "upload_files", "key_column": "key"},
{"table": "tool_files", "key_column": "file_key"},
]
storage_paths = ["image_files", "tools", "upload_files"]
# notify user and ask for confirmation
click.echo(click.style("This command will find and remove orphaned files on the storage,", fg="yellow"))
click.echo(
click.style("by comparing the files on the storage with the records in the following tables:", fg="yellow")
)
for files_table in files_tables:
click.echo(click.style(f"- {files_table['table']}", fg="yellow"))
click.echo(click.style("The following paths on the storage will be scanned to find orphaned files:", fg="yellow"))
for storage_path in storage_paths:
click.echo(click.style(f"- {storage_path}", fg="yellow"))
click.echo("")
click.echo(click.style("!!! USE WITH CAUTION !!!", fg="red"))
click.echo(
click.style(
"Currently, this command will work only for opendal based storage (STORAGE_TYPE=opendal).", fg="yellow"
)
)
click.echo(
click.style(
"Since not all patterns have been fully tested, please note that this command may delete unintended files.",
fg="yellow",
)
)
click.echo(
click.style("This cannot be undone. Please make sure to back up your database before proceeding.", fg="yellow")
)
click.confirm("Do you want to proceed?", abort=True)
# start the cleanup process
click.echo(click.style("Starting orphaned files cleanup.", fg="white"))
# fetch file id and keys from each table
all_files_in_tables = []
try:
for files_table in files_tables:
click.echo(click.style(f"- Listing files from table {files_table['table']}", fg="white"))
query = f"SELECT {files_table['key_column']} FROM {files_table['table']}"
with db.engine.begin() as conn:
rs = conn.execute(db.text(query))
for i in rs:
all_files_in_tables.append(str(i[0]))
click.echo(click.style(f"Found {len(all_files_in_tables)} files in tables.", fg="white"))
except Exception as e:
click.echo(click.style(f"Error fetching keys: {str(e)}", fg="red"))
all_files_on_storage = []
for storage_path in storage_paths:
try:
click.echo(click.style(f"- Scanning files on storage path {storage_path}", fg="white"))
files = storage.scan(path=storage_path, files=True, directories=False)
all_files_on_storage.extend(files)
except FileNotFoundError as e:
click.echo(click.style(f" -> Skipping path {storage_path} as it does not exist.", fg="yellow"))
continue
except Exception as e:
click.echo(click.style(f" -> Error scanning files on storage path {storage_path}: {str(e)}", fg="red"))
continue
click.echo(click.style(f"Found {len(all_files_on_storage)} files on storage.", fg="white"))
# find orphaned files
orphaned_files = list(set(all_files_on_storage) - set(all_files_in_tables))
if not orphaned_files:
click.echo(click.style("No orphaned files found. There is nothing to remove.", fg="green"))
return
click.echo(click.style(f"Found {len(orphaned_files)} orphaned files.", fg="white"))
for file in orphaned_files:
click.echo(click.style(f"- orphaned file: {file}", fg="black"))
click.confirm(f"Do you want to proceed to remove all {len(orphaned_files)} orphaned files?", abort=True)
# delete orphaned files
removed_files = 0
error_files = 0
for file in orphaned_files:
try:
storage.delete(file)
removed_files += 1
click.echo(click.style(f"- Removing orphaned file: {file}", fg="white"))
except Exception as e:
error_files += 1
click.echo(click.style(f"- Error deleting orphaned file {file}: {str(e)}", fg="red"))
continue
if error_files == 0:
click.echo(click.style(f"Removed {removed_files} orphaned files without errors.", fg="green"))
else:
click.echo(click.style(f"Removed {removed_files} orphaned files, with {error_files} errors.", fg="yellow"))

View File

@ -5,6 +5,7 @@ def init_app(app: DifyApp):
from commands import ( from commands import (
add_qdrant_index, add_qdrant_index,
clear_free_plan_tenant_expired_logs, clear_free_plan_tenant_expired_logs,
clear_orphaned_file_records,
convert_to_agent_apps, convert_to_agent_apps,
create_tenant, create_tenant,
extract_plugins, extract_plugins,
@ -13,6 +14,7 @@ def init_app(app: DifyApp):
install_plugins, install_plugins,
migrate_data_for_plugin, migrate_data_for_plugin,
old_metadata_migration, old_metadata_migration,
remove_orphaned_files_on_storage,
reset_email, reset_email,
reset_encrypt_key_pair, reset_encrypt_key_pair,
reset_password, reset_password,
@ -36,6 +38,8 @@ def init_app(app: DifyApp):
install_plugins, install_plugins,
old_metadata_migration, old_metadata_migration,
clear_free_plan_tenant_expired_logs, clear_free_plan_tenant_expired_logs,
clear_orphaned_file_records,
remove_orphaned_files_on_storage,
] ]
for cmd in cmds_to_register: for cmd in cmds_to_register:
app.cli.add_command(cmd) app.cli.add_command(cmd)

View File

@ -102,6 +102,9 @@ class Storage:
def delete(self, filename): def delete(self, filename):
return self.storage_runner.delete(filename) return self.storage_runner.delete(filename)
def scan(self, path: str, files: bool = True, directories: bool = False) -> list[str]:
return self.storage_runner.scan(path, files=files, directories=directories)
storage = Storage() storage = Storage()

View File

@ -30,3 +30,11 @@ class BaseStorage(ABC):
@abstractmethod @abstractmethod
def delete(self, filename): def delete(self, filename):
raise NotImplementedError raise NotImplementedError
def scan(self, path, files=True, directories=False) -> list[str]:
"""
Scan files and directories in the given path.
This method is implemented only in some storage backends.
If a storage backend doesn't support scanning, it will raise NotImplementedError.
"""
raise NotImplementedError("This storage backend doesn't support scanning")

View File

@ -80,3 +80,20 @@ class OpenDALStorage(BaseStorage):
logger.debug(f"file {filename} deleted") logger.debug(f"file {filename} deleted")
return return
logger.debug(f"file {filename} not found, skip delete") logger.debug(f"file {filename} not found, skip delete")
def scan(self, path: str, files: bool = True, directories: bool = False) -> list[str]:
if not self.exists(path):
raise FileNotFoundError("Path not found")
all_files = self.op.scan(path=path)
if files and directories:
logger.debug(f"files and directories on {path} scanned")
return [f.path for f in all_files]
if files:
logger.debug(f"files on {path} scanned")
return [f.path for f in all_files if not f.path.endswith("/")]
elif directories:
logger.debug(f"directories on {path} scanned")
return [f.path for f in all_files if f.path.endswith("/")]
else:
raise ValueError("At least one of files or directories must be True")