Fix: possible memory leaks, closes #5277 (#5500)

### What problem does this PR solve?

Closes #5277 by making sure the opened PDF files are closed after use.

### Type of change

- [x] Performance Improvement

---------

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
yihong 2025-03-03 10:26:45 +08:00 committed by GitHub
parent d6836444c9
commit 8a2542157f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 9 additions and 1 deletion

View File

@@ -188,6 +188,7 @@ def thumbnail_img(filename, blob):
buffered = BytesIO()
else:
break
pdf.close()
return img
elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):

View File

@@ -950,7 +950,9 @@ class RAGFlowPdfParser:
try:
pdf = pdfplumber.open(
fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages)
total_page = len(pdf.pages)
pdf.close()
return total_page
except Exception:
logging.exception("total_page_number")
@@ -996,9 +998,12 @@ class RAGFlowPdfParser:
dfs(outlines, 0)
except Exception as e:
logging.warning(f"Outlines exception: {e}")
finally:
self.pdf.close()
if not self.outlines:
logging.warning("Miss outlines")
logging.debug("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in

View File

@@ -42,6 +42,7 @@ def init_in_out(args):
for i, page in enumerate(images):
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
pdf.close()
def images_and_outputs(fnm):
nonlocal outputs, images

View File

@@ -18,6 +18,7 @@
# beartype_all(conf=BeartypeConf(violation_type=UserWarning)) # <-- emit warnings from all code
import random
import sys
from api.utils.log_utils import initRootLogger, get_project_base_directory
from graphrag.general.index import WithCommunity, WithResolution, Dealer
from graphrag.light.graph_extractor import GraphExtractor as LightKGExt