From f0879563d0e350a8bf19ce40872adc5a9c51775d Mon Sep 17 00:00:00 2001 From: Qidi Cao <105201479+XiaoCaoAskedForHelp@users.noreply.github.com> Date: Fri, 30 May 2025 12:56:33 +0800 Subject: [PATCH] fix: resolve residual image files issue after document deletion (#7964) ### What problem does this PR solve? When deleting knowledge base documents in RAGFlow, the current process only removes the chunk text in Elasticsearch and the original files in MinIO, but it leaves behind many binary images and thumbnails generated during chunking. This pull request improves the deletion process by querying the document's chunk information in Elasticsearch and removing the corresponding stored images, as well as the document thumbnail, to ensure a complete cleanup. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/db/services/document_service.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 284df853a..3bc736e66 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -27,6 +27,7 @@ import xxhash from peewee import fn from api import settings +from api.constants import IMG_BASE64_PREFIX from api.db import FileType, LLMType, ParserType, StatusEnum, TaskStatus, UserTenantRole from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant from api.db.db_utils import bulk_insert_into_db @@ -147,7 +148,26 @@ class DocumentService(CommonService): def remove_document(cls, doc, tenant_id): cls.clear_chunk_num(doc.id) try: + page = 0 + page_size = 1000 + all_chunk_ids = [] + while True: + chunks = settings.docStoreConn.search(["img_id"], [], {"doc_id": doc.id}, [], OrderByExpr(), + page * page_size, page_size, search.index_name(tenant_id), + [doc.kb_id]) + chunk_ids = settings.docStoreConn.getChunkIds(chunks) + if not chunk_ids: + break + all_chunk_ids.extend(chunk_ids) + page += 1 + for cid in all_chunk_ids: + if STORAGE_IMPL.obj_exist(doc.kb_id, cid): + 
STORAGE_IMPL.rm(doc.kb_id, cid) + if doc.thumbnail and not doc.thumbnail.startswith(IMG_BASE64_PREFIX): + if STORAGE_IMPL.obj_exist(doc.kb_id, doc.thumbnail): + STORAGE_IMPL.rm(doc.kb_id, doc.thumbnail) settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id) + graph_source = settings.docStoreConn.getFields( settings.docStoreConn.search(["source_id"], [], {"kb_id": doc.kb_id, "knowledge_graph_kwd": ["graph"]}, [], OrderByExpr(), 0, 1, search.index_name(tenant_id), [doc.kb_id]), ["source_id"] )