fix: resolve residual image files issue after document deletion (#7964)

### What problem does this PR solve?

When deleting knowledge base documents in RAGFlow, the current process
only removes the block texts in Elasticsearch and the original files in
MinIO, but it leaves behind many binary images and thumbnails generated
during chunking. This pull request improves the deletion process by
querying the block information in Elasticsearch to ensure a more
thorough and complete cleanup.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Qidi Cao 2025-05-30 12:56:33 +08:00 committed by GitHub
parent 02db995e94
commit f0879563d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -27,6 +27,7 @@ import xxhash
from peewee import fn
from api import settings
from api.constants import IMG_BASE64_PREFIX
from api.db import FileType, LLMType, ParserType, StatusEnum, TaskStatus, UserTenantRole
from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant
from api.db.db_utils import bulk_insert_into_db
@ -147,7 +148,26 @@ class DocumentService(CommonService):
def remove_document(cls, doc, tenant_id):
cls.clear_chunk_num(doc.id)
try:
page = 0
page_size = 1000
all_chunk_ids = []
while True:
chunks = settings.docStoreConn.search(["img_id"], [], {"doc_id": doc.id}, [], OrderByExpr(),
page * page_size, page_size, search.index_name(tenant_id),
[doc.kb_id])
chunk_ids = settings.docStoreConn.getChunkIds(chunks)
if not chunk_ids:
break
all_chunk_ids.extend(chunk_ids)
page += 1
for cid in all_chunk_ids:
if STORAGE_IMPL.obj_exist(doc.kb_id, cid):
STORAGE_IMPL.rm(doc.kb_id, cid)
if doc.thumbnail and not doc.thumbnail.startswith(IMG_BASE64_PREFIX):
if STORAGE_IMPL.obj_exist(doc.kb_id, doc.thumbnail):
STORAGE_IMPL.rm(doc.kb_id, doc.thumbnail)
settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
graph_source = settings.docStoreConn.getFields(
settings.docStoreConn.search(["source_id"], [], {"kb_id": doc.kb_id, "knowledge_graph_kwd": ["graph"]}, [], OrderByExpr(), 0, 1, search.index_name(tenant_id), [doc.kb_id]), ["source_id"]
)