mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-10 16:59:01 +08:00
fix: resolve residual image files issue after document deletion (#7964)
### What problem does this PR solve? When deleting knowledge base documents in RAGFlow, the current process only removes the block texts in Elasticsearch and the original files in MinIO, but it leaves behind many binary images and thumbnails generated during chunking. This pull request improves the deletion process by querying the block information in Elasticsearch to ensure a more thorough and complete cleanup. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
02db995e94
commit
f0879563d0
@ -27,6 +27,7 @@ import xxhash
|
||||
from peewee import fn
|
||||
|
||||
from api import settings
|
||||
from api.constants import IMG_BASE64_PREFIX
|
||||
from api.db import FileType, LLMType, ParserType, StatusEnum, TaskStatus, UserTenantRole
|
||||
from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant
|
||||
from api.db.db_utils import bulk_insert_into_db
|
||||
@ -147,7 +148,26 @@ class DocumentService(CommonService):
|
||||
def remove_document(cls, doc, tenant_id):
|
||||
cls.clear_chunk_num(doc.id)
|
||||
try:
|
||||
page = 0
|
||||
page_size = 1000
|
||||
all_chunk_ids = []
|
||||
while True:
|
||||
chunks = settings.docStoreConn.search(["img_id"], [], {"doc_id": doc.id}, [], OrderByExpr(),
|
||||
page * page_size, page_size, search.index_name(tenant_id),
|
||||
[doc.kb_id])
|
||||
chunk_ids = settings.docStoreConn.getChunkIds(chunks)
|
||||
if not chunk_ids:
|
||||
break
|
||||
all_chunk_ids.extend(chunk_ids)
|
||||
page += 1
|
||||
for cid in all_chunk_ids:
|
||||
if STORAGE_IMPL.obj_exist(doc.kb_id, cid):
|
||||
STORAGE_IMPL.rm(doc.kb_id, cid)
|
||||
if doc.thumbnail and not doc.thumbnail.startswith(IMG_BASE64_PREFIX):
|
||||
if STORAGE_IMPL.obj_exist(doc.kb_id, doc.thumbnail):
|
||||
STORAGE_IMPL.rm(doc.kb_id, doc.thumbnail)
|
||||
settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
|
||||
|
||||
graph_source = settings.docStoreConn.getFields(
|
||||
settings.docStoreConn.search(["source_id"], [], {"kb_id": doc.kb_id, "knowledge_graph_kwd": ["graph"]}, [], OrderByExpr(), 0, 1, search.index_name(tenant_id), [doc.kb_id]), ["source_id"]
|
||||
)
|
||||
|
Loading…
x
Reference in New Issue
Block a user