From 4230402fbb74b215b49326ec0c7a9812ca3695ff Mon Sep 17 00:00:00 2001
From: Zhichang Yu
Date: Fri, 24 Jan 2025 09:48:02 +0800
Subject: [PATCH] deepdoc use GPU if possible (#4618)

### What problem does this PR solve?

Make deepdoc use the GPU when one is available.

### Type of change

- [x] Refactoring
---
 deepdoc/vision/ocr.py        | 30 +++++++++++++++++++++++-------
 deepdoc/vision/recognizer.py | 23 ++++++++++++++++++++---
 2 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py
index 9d9d44cde..4150adad8 100644
--- a/deepdoc/vision/ocr.py
+++ b/deepdoc/vision/ocr.py
@@ -14,6 +14,7 @@
 #  limitations under the License.
 #
 
+import logging
 import copy
 import time
 import os
@@ -75,17 +76,32 @@ def load_model(model_dir, nm):
     options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
     options.intra_op_num_threads = 2
     options.inter_op_num_threads = 2
-    if False and ort.get_device() == "GPU":
+
+    # https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580
+    # Shrink GPU memory after execution
+    run_options = ort.RunOptions()
+    if ort.get_device() == "GPU":
+        cuda_provider_options = {
+            "device_id": 0,  # Use specific GPU
+            "gpu_mem_limit": 512 * 1024 * 1024,  # Limit gpu memory
+            "arena_extend_strategy": "kNextPowerOfTwo",  # gpu memory allocation strategy
+        }
         sess = ort.InferenceSession(
             model_file_path,
             options=options,
-            providers=['CUDAExecutionProvider'])
+            providers=['CUDAExecutionProvider'],
+            provider_options=[cuda_provider_options]
+        )
+        run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "gpu:0")
+        logging.info(f"TextRecognizer {nm} uses GPU")
     else:
         sess = ort.InferenceSession(
             model_file_path,
             options=options,
             providers=['CPUExecutionProvider'])
-    return sess, sess.get_inputs()[0]
+        run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu")
+        logging.info(f"TextRecognizer {nm} uses CPU")
+    return sess, sess.get_inputs()[0], run_options
 
 
 class TextRecognizer(object):
@@ -98,7 +114,7 @@ class TextRecognizer(object):
             "use_space_char": True
         }
         self.postprocess_op = build_post_process(postprocess_params)
-        self.predictor, self.input_tensor = load_model(model_dir, 'rec')
+        self.predictor, self.input_tensor, self.run_options = load_model(model_dir, 'rec')
 
     def resize_norm_img(self, img, max_wh_ratio):
         imgC, imgH, imgW = self.rec_image_shape
@@ -344,7 +360,7 @@ class TextRecognizer(object):
             input_dict[self.input_tensor.name] = norm_img_batch
             for i in range(100000):
                 try:
-                    outputs = self.predictor.run(None, input_dict)
+                    outputs = self.predictor.run(None, input_dict, self.run_options)
                     break
                 except Exception as e:
                     if i >= 3:
@@ -383,7 +399,7 @@ class TextDetector(object):
                              "unclip_ratio": 1.5, "use_dilation": False,
                              "score_mode": "fast", "box_type": "quad"}
         self.postprocess_op = build_post_process(postprocess_params)
-        self.predictor, self.input_tensor = load_model(model_dir, 'det')
+        self.predictor, self.input_tensor, self.run_options = load_model(model_dir, 'det')
 
         img_h, img_w = self.input_tensor.shape[2:]
         if isinstance(img_h, str) or isinstance(img_w, str):
@@ -456,7 +472,7 @@ class TextDetector(object):
             input_dict[self.input_tensor.name] = img
             for i in range(100000):
                 try:
-                    outputs = self.predictor.run(None, input_dict)
+                    outputs = self.predictor.run(None, input_dict, self.run_options)
                     break
                 except Exception as e:
                     if i >= 3:
diff --git a/deepdoc/vision/recognizer.py b/deepdoc/vision/recognizer.py
index aeef52d57..54ba68d65 100644
--- a/deepdoc/vision/recognizer.py
+++ b/deepdoc/vision/recognizer.py
@@ -60,12 +60,29 @@ class Recognizer(object):
         if not os.path.exists(model_file_path):
             raise ValueError("not find model file path {}".format(
                 model_file_path))
-        if False and ort.get_device() == "GPU":
+        # https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580
+        # Shrink GPU memory after execution
+        self.run_options = ort.RunOptions()
+
+        if ort.get_device() == "GPU":
             options = ort.SessionOptions()
             options.enable_cpu_mem_arena = False
-            self.ort_sess = ort.InferenceSession(model_file_path, options=options, providers=[('CUDAExecutionProvider')])
+            cuda_provider_options = {
+                "device_id": 0,  # Use specific GPU
+                "gpu_mem_limit": 512 * 1024 * 1024,  # Limit gpu memory
+                "arena_extend_strategy": "kNextPowerOfTwo",  # gpu memory allocation strategy
+            }
+            self.ort_sess = ort.InferenceSession(
+                model_file_path, options=options,
+                providers=['CUDAExecutionProvider'],
+                provider_options=[cuda_provider_options]
+            )
+            self.run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "gpu:0")
+            logging.info(f"Recognizer {task_name} uses GPU")
         else:
             self.ort_sess = ort.InferenceSession(model_file_path, providers=['CPUExecutionProvider'])
+            self.run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu")
+            logging.info(f"Recognizer {task_name} uses CPU")
         self.input_names = [node.name for node in self.ort_sess.get_inputs()]
         self.output_names = [node.name for node in self.ort_sess.get_outputs()]
         self.input_shape = self.ort_sess.get_inputs()[0].shape[2:4]
@@ -454,7 +471,7 @@ class Recognizer(object):
         inputs = self.preprocess(batch_image_list)
         logging.debug("preprocess")
         for ins in inputs:
-            bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names})[0], ins, thr)
+            bb = self.postprocess(self.ort_sess.run(None, {k:v for k,v in ins.items() if k in self.input_names}, self.run_options)[0], ins, thr)
             res.append(bb)
 
         #seeit.save_results(image_list, res, self.label_list, threshold=thr)
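
For reviewers, here is the session-setup pattern this patch applies, isolated into a minimal, self-contained sketch. `load_session` and the model path are illustrative names, not part of the patch, and the keyword `sess_options` is the documented parameter name of `ort.InferenceSession`; everything else mirrors the diff above.

```python
import logging

import onnxruntime as ort


def load_session(model_file_path: str):
    """Create an ONNX Runtime session on GPU if available, else CPU."""
    sess_options = ort.SessionOptions()
    sess_options.enable_cpu_mem_arena = False

    # Ask ORT to shrink its memory arena after every run() call; see
    # https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580
    run_options = ort.RunOptions()

    if ort.get_device() == "GPU":
        cuda_provider_options = {
            "device_id": 0,                              # pin to a specific GPU
            "gpu_mem_limit": 512 * 1024 * 1024,          # cap the CUDA arena at 512 MiB
            "arena_extend_strategy": "kNextPowerOfTwo",  # arena growth strategy
        }
        sess = ort.InferenceSession(
            model_file_path,
            sess_options=sess_options,
            providers=["CUDAExecutionProvider"],
            provider_options=[cuda_provider_options],
        )
        run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "gpu:0")
        logging.info("session uses GPU")
    else:
        sess = ort.InferenceSession(
            model_file_path,
            sess_options=sess_options,
            providers=["CPUExecutionProvider"],
        )
        run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "cpu")
        logging.info("session uses CPU")

    # Pass run_options on every inference so the arena is shrunk afterwards:
    #     outputs = sess.run(None, {"input": batch}, run_options)
    return sess, run_options
```

Note that `ort.get_device()` returns "GPU" whenever the installed onnxruntime build has GPU support, and `gpu_mem_limit` caps only the CUDA provider's arena, not total GPU memory; per-run arena shrinkage trades some reallocation cost for a smaller steady-state footprint.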