From 8a2542157fc22fd49daad465704aa1ab78dc87ba Mon Sep 17 00:00:00 2001 From: yihong Date: Mon, 3 Mar 2025 10:26:45 +0800 Subject: [PATCH] Fix: possible memory leaks close #5277 (#5500) ### What problem does this PR solve? Closes #5277 by making sure opened files are closed. ### Type of change - [x] Performance Improvement --------- Signed-off-by: yihong0618 --- api/utils/file_utils.py | 1 + deepdoc/parser/pdf_parser.py | 7 ++++++- deepdoc/vision/__init__.py | 1 + rag/svr/task_executor.py | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index ed014ac0e..d76be81c9 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -188,6 +188,7 @@ def thumbnail_img(filename, blob): buffered = BytesIO() else: break + pdf.close() return img elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename): diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 2282a87fb..d4645f8a9 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -950,7 +950,9 @@ class RAGFlowPdfParser: try: pdf = pdfplumber.open( fnm) if not binary else pdfplumber.open(BytesIO(binary)) - return len(pdf.pages) + total_page = len(pdf.pages) + pdf.close() + return total_page except Exception: logging.exception("total_page_number") @@ -996,8 +998,11 @@ class RAGFlowPdfParser: dfs(outlines, 0) except Exception as e: logging.warning(f"Outlines exception: {e}") + finally: + self.pdf.close() if not self.outlines: logging.warning("Miss outlines") + logging.debug("Images converted.") self.is_english = [re.search(r"[a-zA-Z0-9,/ΒΈ;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join( diff --git a/deepdoc/vision/__init__.py b/deepdoc/vision/__init__.py index 64afcaf72..c67178c8f 100644 --- a/deepdoc/vision/__init__.py +++ b/deepdoc/vision/__init__.py @@ -42,6 +42,7 @@ def init_in_out(args): for i, page in enumerate(images): outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") + pdf.close() def 
images_and_outputs(fnm): nonlocal outputs, images diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 3fc69f3f5..05388f8df 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -18,6 +18,7 @@ # beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code import random import sys + from api.utils.log_utils import initRootLogger, get_project_base_directory from graphrag.general.index import WithCommunity, WithResolution, Dealer from graphrag.light.graph_extractor import GraphExtractor as LightKGExt